DPDK patches and discussions
* [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats
@ 2018-04-25 14:10 Anatoly Burakov
  2018-04-25 14:10 ` [dpdk-dev] [PATCH 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
                   ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-04-25 14:10 UTC
  To: dev; +Cc: shreyansh.jain

Users might be interested in finding out the biggest chunk of
IOVA-contiguous free space that can be allocated from malloc. Add
the relevant malloc-internal functions and expose this value
through the malloc stats calculation call.
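
As an illustration (not part of this patch), the new statistic can be
read through the existing stats API once the change is applied. A
minimal sketch, assuming EAL is already initialized:

#include <stdio.h>
#include <rte_lcore.h>
#include <rte_malloc.h>

/* print the biggest IOVA-contiguous free block on each socket's heap */
static void
print_biggest_iova_contig(void)
{
	struct rte_malloc_socket_stats stats;
	unsigned int i;

	for (i = 0; i < rte_socket_count(); i++) {
		int socket = rte_socket_id_by_idx(i);

		if (rte_malloc_get_socket_stats(socket, &stats) < 0)
			continue;
		printf("socket %d: biggest IOVA-contiguous free block: %zu bytes\n",
				socket, stats.greatest_free_iova_contig_size);
	}
}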

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/common/include/rte_malloc.h |  1 +
 lib/librte_eal/common/malloc_elem.c        | 55 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_elem.h        |  3 ++
 lib/librte_eal/common/malloc_heap.c        |  8 +++++
 lib/librte_eal/common/rte_malloc.c         |  2 ++
 5 files changed, 69 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e4..2553201 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -27,6 +27,7 @@ struct rte_malloc_socket_stats {
 	size_t heap_totalsz_bytes; /**< Total bytes on heap */
 	size_t heap_freesz_bytes;  /**< Total free bytes on heap */
 	size_t greatest_free_size; /**< Size in bytes of largest free block */
+	size_t greatest_free_iova_contig_size; /**< Size in bytes of largest IOVA-contiguous block */
 	unsigned free_count;       /**< Number of free elements on heap */
 	unsigned alloc_count;      /**< Number of allocated elements on heap */
 	size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ee79dcd..415ce64 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,12 +18,67 @@
 #include <rte_common.h>
 #include <rte_spinlock.h>
 
+#include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
 #define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE)
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem)
+{
+	void *cur_page, *contig_seg_start, *page_end, *elem_end, *cur_seg_end;
+	rte_iova_t expected_iova;
+	struct rte_memseg *ms;
+	size_t page_sz, cur, max;
+
+	/* if we're in IOVA as VA mode, or if we're in legacy mode with
+	 * hugepages, all elements are IOVA-contiguous.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA ||
+			(internal_config.legacy_mem && rte_eal_has_hugepages()))
+		return elem->size;
+
+	page_sz = (size_t)elem->msl->page_sz;
+	elem_end = RTE_PTR_ADD(elem, elem->size);
+	cur_page = RTE_PTR_ALIGN_FLOOR(elem, page_sz);
+	ms = rte_mem_virt2memseg(cur_page, elem->msl);
+	contig_seg_start = elem;
+
+	/* do first iteration outside the loop */
+	page_end = RTE_PTR_ADD(cur_page, page_sz);
+	cur_seg_end = RTE_MIN(page_end, elem_end);
+	cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+	max = cur;
+	expected_iova = ms->iova + page_sz;
+	/* memsegs are contiguous in memory */
+	ms++;
+
+	cur_page = RTE_PTR_ADD(cur_page, page_sz);
+
+	while (cur_page < elem_end) {
+		page_end = RTE_PTR_ADD(cur_page, page_sz);
+		cur_seg_end = RTE_MIN(page_end, elem_end);
+
+		/* reset start of contiguous segment if unexpected iova */
+		if (ms->iova != expected_iova)
+			contig_seg_start = cur_page;
+		cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+		/* update max if cur value is bigger */
+		if (cur > max)
+			max = cur;
+
+		/* move to next page */
+		cur_page = page_end;
+		expected_iova = ms->iova + page_sz;
+		/* memsegs are contiguous in memory */
+		ms++;
+	}
+
+	return max;
+}
+
 /*
  * Initialize a general malloc_elem header structure
  */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 8f4aef8..19e8db5 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -177,4 +177,7 @@ malloc_elem_free_list_index(size_t size);
 void
 malloc_elem_free_list_insert(struct malloc_elem *elem);
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem);
+
 #endif /* MALLOC_ELEM_H_ */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 590e9e3..d8ad164 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -764,16 +764,24 @@ malloc_heap_get_stats(struct malloc_heap *heap,
 	socket_stats->free_count = 0;
 	socket_stats->heap_freesz_bytes = 0;
 	socket_stats->greatest_free_size = 0;
+	socket_stats->greatest_free_iova_contig_size = 0;
 
 	/* Iterate through free list */
 	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
 		for (elem = LIST_FIRST(&heap->free_head[idx]);
 			!!elem; elem = LIST_NEXT(elem, free_list))
 		{
+			size_t iova_contig_sz;
 			socket_stats->free_count++;
 			socket_stats->heap_freesz_bytes += elem->size;
 			if (elem->size > socket_stats->greatest_free_size)
 				socket_stats->greatest_free_size = elem->size;
+			iova_contig_sz =
+					malloc_elem_find_max_iova_contig(elem);
+			if (iova_contig_sz >
+					socket_stats->greatest_free_iova_contig_size)
+				socket_stats->greatest_free_iova_contig_size =
+						iova_contig_sz;
 		}
 	}
 	/* Get stats on overall heap and allocated memory on this heap */
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d1..b4e87a3 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -195,6 +195,8 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
 		fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
 		fprintf(f, "\tGreatest_free_size:%zu,\n",
 				sock_stats.greatest_free_size);
+		fprintf(f, "\tGreatest_free_iova_contig_size:%zu,\n",
+				sock_stats.greatest_free_iova_contig_size);
 		fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
 		fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
 	}
-- 
2.7.4
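
Editor's note: malloc_elem_find_max_iova_contig() above walks the
element one page at a time, comparing each page's actual IOVA against
the IOVA it would have if memory were physically contiguous, and
restarting the run on a mismatch. A self-contained sketch of the same
idea over a plain array of per-page IOVAs (hypothetical helper, not
DPDK code):

#include <stddef.h>
#include <stdint.h>

/* longest physically contiguous run, in bytes, given per-page IOVAs */
static size_t
max_contig_run(const uint64_t *iova, size_t n_pages, size_t page_sz)
{
	size_t i, cur = 1, max = 1;

	if (n_pages == 0)
		return 0;
	for (i = 1; i < n_pages; i++) {
		if (iova[i] == iova[i - 1] + page_sz)
			cur++;	/* this page extends the current run */
		else
			cur = 1;	/* unexpected IOVA: run restarts here */
		if (cur > max)
			max = cur;
	}
	return max * page_sz;
}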


* [dpdk-dev] [PATCH 2/2] memzone: allow IOVA-contiguous memzones with zero size
  2018-04-25 14:10 [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
@ 2018-04-25 14:10 ` Anatoly Burakov
  2018-04-25 14:40 ` [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Burakov, Anatoly
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-04-25 14:10 UTC
  To: dev; +Cc: shreyansh.jain

Previously, reserving an IOVA-contiguous memzone with zero size
(which reserves the biggest available memzone) was not allowed.
Now that the size of the biggest IOVA-contiguous malloc element
is exposed through a malloc stats call, this becomes possible.
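
As an illustration (not part of this patch), the following reservation,
which previously failed with rte_errno set to EINVAL, is now expected
to succeed. A minimal sketch; the zone name and socket are chosen here
for illustration only:

#include <stdio.h>
#include <rte_errno.h>
#include <rte_memzone.h>

/* reserve the biggest available IOVA-contiguous zone on socket 0 */
static const struct rte_memzone *
reserve_biggest_contig(void)
{
	const struct rte_memzone *mz;

	mz = rte_memzone_reserve("big_contig_zone", 0 /* biggest */,
			0 /* socket */, RTE_MEMZONE_IOVA_CONTIG);
	if (mz == NULL)
		printf("reservation failed: %s\n", rte_strerror(rte_errno));
	return mz;
}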

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/common/eal_common_memzone.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index bce3321..22b60b5 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -57,7 +57,7 @@ memzone_lookup_thread_unsafe(const char *name)
  * specified. If no heap has been specified, it will return the heap and
  * length of the greatest free block available in all heaps */
 static size_t
-find_heap_max_free_elem(int *s, unsigned align)
+find_heap_max_free_elem(int *s, unsigned align, bool contig)
 {
 	struct rte_mem_config *mcfg;
 	struct rte_malloc_socket_stats stats;
@@ -68,12 +68,16 @@ find_heap_max_free_elem(int *s, unsigned align)
 	mcfg = rte_eal_get_configuration()->mem_config;
 
 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+		size_t found_len;
 		if ((socket != SOCKET_ID_ANY) && (socket != i))
 			continue;
 
 		malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
-		if (stats.greatest_free_size > len) {
-			len = stats.greatest_free_size;
+		found_len = contig ?
+				stats.greatest_free_iova_contig_size :
+				stats.greatest_free_size;
+		if (found_len > len) {
+			len = found_len;
 			*s = i;
 		}
 	}
@@ -166,16 +170,11 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
 	if (len == 0) {
-		/* len == 0 is only allowed for non-contiguous zones */
-		if (contig) {
-			RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
-			rte_errno = EINVAL;
-			return NULL;
-		}
 		if (bound != 0)
 			requested_len = bound;
 		else {
-			requested_len = find_heap_max_free_elem(&socket_id, align);
+			requested_len = find_heap_max_free_elem(&socket_id,
+					align, contig);
 			if (requested_len == 0) {
 				rte_errno = ENOMEM;
 				return NULL;
-- 
2.7.4


* Re: [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats
  2018-04-25 14:10 [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
  2018-04-25 14:10 ` [dpdk-dev] [PATCH 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
@ 2018-04-25 14:40 ` Burakov, Anatoly
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Burakov, Anatoly @ 2018-04-25 14:40 UTC
  To: dev; +Cc: shreyansh.jain

On 25-Apr-18 3:10 PM, Anatoly Burakov wrote:
> Users might be interested in finding out the biggest chunk of
> IOVA-contiguous free space that can be allocated from malloc. Add
> the relevant malloc-internal functions and expose this value
> through the malloc stats calculation call.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---

Note: this breaks the ABI, but the ABI is already broken for this
release, so it's OK.

-- 
Thanks,
Anatoly


* [dpdk-dev] [PATCH v2 1/2] malloc: add biggest free IOVA-contiguous element to stats
  2018-04-25 14:10 [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
  2018-04-25 14:10 ` [dpdk-dev] [PATCH 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
  2018-04-25 14:40 ` [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Burakov, Anatoly
@ 2018-04-26  8:06 ` Anatoly Burakov
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
                     ` (3 more replies)
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
  3 siblings, 4 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-04-26  8:06 UTC
  To: dev; +Cc: shreyansh.jain

Users might be interested in finding out the biggest chunk of
IOVA-contiguous free space that can be allocated from malloc. Add
the relevant malloc-internal functions and expose this value
through the malloc stats calculation call.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v2:
    - Add header to newly recalculated element start

 lib/librte_eal/common/include/rte_malloc.h |  1 +
 lib/librte_eal/common/malloc_elem.c        | 59 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_elem.h        |  3 ++
 lib/librte_eal/common/malloc_heap.c        |  8 ++++
 lib/librte_eal/common/rte_malloc.c         |  2 +
 5 files changed, 73 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e4..2553201 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -27,6 +27,7 @@ struct rte_malloc_socket_stats {
 	size_t heap_totalsz_bytes; /**< Total bytes on heap */
 	size_t heap_freesz_bytes;  /**< Total free bytes on heap */
 	size_t greatest_free_size; /**< Size in bytes of largest free block */
+	size_t greatest_free_iova_contig_size; /**< Size in bytes of largest IOVA-contiguous block */
 	unsigned free_count;       /**< Number of free elements on heap */
 	unsigned alloc_count;      /**< Number of allocated elements on heap */
 	size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ee79dcd..d28ede6 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,12 +18,71 @@
 #include <rte_common.h>
 #include <rte_spinlock.h>
 
+#include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
 #define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE)
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem)
+{
+	void *cur_page, *contig_seg_start, *page_end, *elem_end, *cur_seg_end;
+	rte_iova_t expected_iova;
+	struct rte_memseg *ms;
+	size_t page_sz, cur, max;
+
+	/* if we're in IOVA as VA mode, or if we're in legacy mode with
+	 * hugepages, all elements are IOVA-contiguous.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA ||
+			(internal_config.legacy_mem && rte_eal_has_hugepages()))
+		return elem->size;
+
+	page_sz = (size_t)elem->msl->page_sz;
+	elem_end = RTE_PTR_ADD(elem, elem->size);
+	cur_page = RTE_PTR_ALIGN_FLOOR(elem, page_sz);
+	ms = rte_mem_virt2memseg(cur_page, elem->msl);
+	contig_seg_start = elem;
+
+	/* do first iteration outside the loop */
+	page_end = RTE_PTR_ADD(cur_page, page_sz);
+	cur_seg_end = RTE_MIN(page_end, elem_end);
+	cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+	max = cur;
+	expected_iova = ms->iova + page_sz;
+	/* memsegs are contiguous in memory */
+	ms++;
+
+	cur_page = RTE_PTR_ADD(cur_page, page_sz);
+
+	while (cur_page < elem_end) {
+		page_end = RTE_PTR_ADD(cur_page, page_sz);
+		cur_seg_end = RTE_MIN(page_end, elem_end);
+
+		/* reset start of contiguous segment if unexpected iova. this is
+		 * a contiguous segment, and elem->size includes header len, so
+		 * move element start point to where header would've started
+		 */
+		if (ms->iova != expected_iova)
+			contig_seg_start = RTE_PTR_SUB(cur_page,
+					MALLOC_ELEM_HEADER_LEN);
+		cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+		/* update max if cur value is bigger */
+		if (cur > max)
+			max = cur;
+
+		/* move to next page */
+		cur_page = page_end;
+		expected_iova = ms->iova + page_sz;
+		/* memsegs are contiguous in memory */
+		ms++;
+	}
+
+	return max;
+}
+
 /*
  * Initialize a general malloc_elem header structure
  */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 8f4aef8..19e8db5 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -177,4 +177,7 @@ malloc_elem_free_list_index(size_t size);
 void
 malloc_elem_free_list_insert(struct malloc_elem *elem);
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem);
+
 #endif /* MALLOC_ELEM_H_ */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 590e9e3..d8ad164 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -764,16 +764,24 @@ malloc_heap_get_stats(struct malloc_heap *heap,
 	socket_stats->free_count = 0;
 	socket_stats->heap_freesz_bytes = 0;
 	socket_stats->greatest_free_size = 0;
+	socket_stats->greatest_free_iova_contig_size = 0;
 
 	/* Iterate through free list */
 	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
 		for (elem = LIST_FIRST(&heap->free_head[idx]);
 			!!elem; elem = LIST_NEXT(elem, free_list))
 		{
+			size_t iova_contig_sz;
 			socket_stats->free_count++;
 			socket_stats->heap_freesz_bytes += elem->size;
 			if (elem->size > socket_stats->greatest_free_size)
 				socket_stats->greatest_free_size = elem->size;
+			iova_contig_sz =
+					malloc_elem_find_max_iova_contig(elem);
+			if (iova_contig_sz >
+					socket_stats->greatest_free_iova_contig_size)
+				socket_stats->greatest_free_iova_contig_size =
+						iova_contig_sz;
 		}
 	}
 	/* Get stats on overall heap and allocated memory on this heap */
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d1..b4e87a3 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -195,6 +195,8 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
 		fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
 		fprintf(f, "\tGreatest_free_size:%zu,\n",
 				sock_stats.greatest_free_size);
+		fprintf(f, "\tGreatest_free_iova_contig_size:%zu,\n",
+				sock_stats.greatest_free_iova_contig_size);
 		fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
 		fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
 	}
-- 
2.7.4


* [dpdk-dev] [PATCH v2 2/2] memzone: allow IOVA-contiguous memzones with zero size
  2018-04-25 14:10 [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
                   ` (2 preceding siblings ...)
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
@ 2018-04-26  8:06 ` Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-04-26  8:06 UTC
  To: dev; +Cc: shreyansh.jain

Previously, reserving an IOVA-contiguous memzone with zero size
(which reserves the biggest available memzone) was not allowed.
Now that the size of the biggest IOVA-contiguous malloc element
is exposed through a malloc stats call, this becomes possible.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v2:
    - Fixed checkpatch warning for unsigned

 lib/librte_eal/common/eal_common_memzone.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index bce3321..9cc9961 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -57,7 +57,7 @@ memzone_lookup_thread_unsafe(const char *name)
  * specified. If no heap has been specified, it will return the heap and
  * length of the greatest free block available in all heaps */
 static size_t
-find_heap_max_free_elem(int *s, unsigned align)
+find_heap_max_free_elem(int *s, unsigned int align, bool contig)
 {
 	struct rte_mem_config *mcfg;
 	struct rte_malloc_socket_stats stats;
@@ -68,12 +68,16 @@ find_heap_max_free_elem(int *s, unsigned align)
 	mcfg = rte_eal_get_configuration()->mem_config;
 
 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+		size_t found_len;
 		if ((socket != SOCKET_ID_ANY) && (socket != i))
 			continue;
 
 		malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
-		if (stats.greatest_free_size > len) {
-			len = stats.greatest_free_size;
+		found_len = contig ?
+				stats.greatest_free_iova_contig_size :
+				stats.greatest_free_size;
+		if (found_len > len) {
+			len = found_len;
 			*s = i;
 		}
 	}
@@ -166,16 +170,11 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
 	if (len == 0) {
-		/* len == 0 is only allowed for non-contiguous zones */
-		if (contig) {
-			RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
-			rte_errno = EINVAL;
-			return NULL;
-		}
 		if (bound != 0)
 			requested_len = bound;
 		else {
-			requested_len = find_heap_max_free_elem(&socket_id, align);
+			requested_len = find_heap_max_free_elem(&socket_id,
+					align, contig);
 			if (requested_len == 0) {
 				rte_errno = ENOMEM;
 				return NULL;
-- 
2.7.4


* [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
@ 2018-05-03 17:17   ` Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
                       ` (3 more replies)
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
                     ` (2 subsequent siblings)
  3 siblings, 4 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-03 17:17 UTC
  To: dev

This patchset does two things. First, it enables reserving
zero-length memzones that are IOVA-contiguous. Second, it
fixes a long-standing race condition in reserving zero-length
memzones, where the malloc heap was not locked between stats
collection and reservation; instead, the biggest element is
now allocated on the spot.

Some limitations are added, but they are a trade-off between
avoiding race conditions and user convenience. It would be
possible to lock all heaps during a zero-length memzone
reserve, and that would keep the old behavior, but given how
long such an allocation (especially when asking for
IOVA-contiguous memory) may take, a design decision was made
to keep things simple, and only check other heaps if the
current one is completely busy.

Ideas on improvement are welcome.

Anatoly Burakov (3):
  malloc: add biggest free IOVA-contiguous element to stats
  malloc: allow reserving biggest element
  memzone: improve zero-length memzone reserve

 lib/librte_eal/common/eal_common_memzone.c  |  62 ++---------
 lib/librte_eal/common/include/rte_malloc.h  |   1 +
 lib/librte_eal/common/include/rte_memzone.h |  18 ++++
 lib/librte_eal/common/malloc_elem.c         |  77 ++++++++++++++
 lib/librte_eal/common/malloc_elem.h         |   6 ++
 lib/librte_eal/common/malloc_heap.c         | 137 ++++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h         |   4 +
 lib/librte_eal/common/rte_malloc.c          |   2 +
 test/test/test_memzone.c                    | 157 +++++++++++++++-------------
 9 files changed, 342 insertions(+), 122 deletions(-)

-- 
2.7.4


* [dpdk-dev] [PATCH v3 1/3] malloc: add biggest free IOVA-contiguous element to stats
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
@ 2018-05-03 17:17   ` Anatoly Burakov
  2018-05-10 13:39     ` Remy Horton
  2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element Anatoly Burakov
  2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 1 reply; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-03 17:17 UTC
  To: dev

Users might be interested in finding out the biggest chunk of
IOVA-contiguous free space that can be allocated from malloc. Add
the relevant malloc-internal functions and expose this value
through the malloc stats calculation call.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v2:
    - Add header to newly recalculated element start

 lib/librte_eal/common/include/rte_malloc.h |  1 +
 lib/librte_eal/common/malloc_elem.c        | 77 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_elem.h        |  6 +++
 lib/librte_eal/common/malloc_heap.c        | 11 +++++
 lib/librte_eal/common/rte_malloc.c         |  2 +
 5 files changed, 97 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e4..2553201 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -27,6 +27,7 @@ struct rte_malloc_socket_stats {
 	size_t heap_totalsz_bytes; /**< Total bytes on heap */
 	size_t heap_freesz_bytes;  /**< Total free bytes on heap */
 	size_t greatest_free_size; /**< Size in bytes of largest free block */
+	size_t greatest_free_iova_contig_size; /**< Size in bytes of largest IOVA-contiguous block */
 	unsigned free_count;       /**< Number of free elements on heap */
 	unsigned alloc_count;      /**< Number of allocated elements on heap */
 	size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 9bfe9b9..d330fb1 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,10 +18,87 @@
 #include <rte_common.h>
 #include <rte_spinlock.h>
 
+#include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
+{
+	void *cur_page, *contig_seg_start, *page_end, *cur_seg_end;
+	void *data_start, *data_end;
+	rte_iova_t expected_iova;
+	struct rte_memseg *ms;
+	size_t page_sz, cur, max;
+
+	page_sz = (size_t)elem->msl->page_sz;
+	data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+	data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN);
+	/* segment must start after header and with specified alignment */
+	contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align);
+
+	/* if we're in IOVA as VA mode, or if we're in legacy mode with
+	 * hugepages, all elements are IOVA-contiguous.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA ||
+			(internal_config.legacy_mem && rte_eal_has_hugepages()))
+		return RTE_PTR_DIFF(data_end, contig_seg_start);
+
+	cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz);
+	ms = rte_mem_virt2memseg(cur_page, elem->msl);
+
+	/* do first iteration outside the loop */
+	page_end = RTE_PTR_ADD(cur_page, page_sz);
+	cur_seg_end = RTE_MIN(page_end, data_end);
+	cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) -
+			MALLOC_ELEM_TRAILER_LEN;
+	max = cur;
+	expected_iova = ms->iova + page_sz;
+	/* memsegs are contiguous in memory */
+	ms++;
+
+	cur_page = RTE_PTR_ADD(cur_page, page_sz);
+
+	while (cur_page < data_end) {
+		page_end = RTE_PTR_ADD(cur_page, page_sz);
+		cur_seg_end = RTE_MIN(page_end, data_end);
+
+		/* reset start of contiguous segment if unexpected iova. this is
+		 * a contiguous segment, and elem->size includes header len, so
+		 * move element start point to where header would've started
+		 */
+		if (ms->iova != expected_iova) {
+			/* next contiguous segment must start at specified
+			 * alignment.
+			 */
+			contig_seg_start = RTE_PTR_ALIGN(cur_page, align);
+			/* new segment start may be on a different page, so find
+			 * the page and skip to next iteration to make sure
+			 * we're not blowing past data end.
+			 */
+			ms = rte_mem_virt2memseg(contig_seg_start, elem->msl);
+			cur_page = ms->addr;
+			/* don't trigger another recalculation */
+			expected_iova = ms->iova;
+			continue;
+		}
+		cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) -
+				MALLOC_ELEM_TRAILER_LEN;
+		/* update max if cur value is bigger */
+		if (cur > max)
+			max = cur;
+
+		/* move to next page */
+		cur_page = page_end;
+		expected_iova = ms->iova + page_sz;
+		/* memsegs are contiguous in memory */
+		ms++;
+	}
+
+	return max;
+}
+
 /*
  * Initialize a general malloc_elem header structure
  */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 7331af9..e2bda4c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -179,4 +179,10 @@ malloc_elem_free_list_index(size_t size);
 void
 malloc_elem_free_list_insert(struct malloc_elem *elem);
 
+/*
+ * Find biggest IOVA-contiguous zone within an element with specified alignment.
+ */
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align);
+
 #endif /* MALLOC_ELEM_H_ */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index d6cf3af..9305b38 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -803,16 +803,27 @@ malloc_heap_get_stats(struct malloc_heap *heap,
 	socket_stats->free_count = 0;
 	socket_stats->heap_freesz_bytes = 0;
 	socket_stats->greatest_free_size = 0;
+	socket_stats->greatest_free_iova_contig_size = 0;
 
 	/* Iterate through free list */
 	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
 		for (elem = LIST_FIRST(&heap->free_head[idx]);
 			!!elem; elem = LIST_NEXT(elem, free_list))
 		{
+			size_t iova_contig_sz;
 			socket_stats->free_count++;
 			socket_stats->heap_freesz_bytes += elem->size;
 			if (elem->size > socket_stats->greatest_free_size)
 				socket_stats->greatest_free_size = elem->size;
+			iova_contig_sz =
+					malloc_elem_find_max_iova_contig(elem,
+							RTE_CACHE_LINE_SIZE);
+			/* find_max_iova_contig doesn't include overhead */
+			iova_contig_sz += MALLOC_ELEM_OVERHEAD;
+			if (iova_contig_sz >
+					socket_stats->greatest_free_iova_contig_size)
+				socket_stats->greatest_free_iova_contig_size =
+						iova_contig_sz;
 		}
 	}
 	/* Get stats on overall heap and allocated memory on this heap */
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d1..b4e87a3 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -195,6 +195,8 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
 		fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
 		fprintf(f, "\tGreatest_free_size:%zu,\n",
 				sock_stats.greatest_free_size);
+		fprintf(f, "\tGreatest_free_iova_contig_size:%zu,\n",
+				sock_stats.greatest_free_iova_contig_size);
 		fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
 		fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
 	}
-- 
2.7.4


* [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
@ 2018-05-03 17:18   ` Anatoly Burakov
  2018-05-10 13:57     ` Remy Horton
  2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 1 reply; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-03 17:18 UTC
  To: dev

Add an internal-only function to allocate the biggest element
from the heap. Nominally, it supports SOCKET_ID_ANY as its socket
argument, but this is essentially useless, because other sockets
will only be allocated from if the entire heap on the current or
specified socket is busy.

Still, being able to reserve the biggest element will allow fixing
a race condition in memzone reserve that has been there for a
long time.
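
As an illustration (not part of this patch), a hypothetical internal
call site using the signature added here; the socket, flags and
alignment values are for illustration only:

#include <stdbool.h>

#include <rte_memory.h>	/* RTE_CACHE_LINE_SIZE */

#include "malloc_heap.h"

/* grab the biggest IOVA-contiguous element on socket 0, cache-line
 * aligned; returns NULL if nothing can be allocated
 */
static void *
grab_biggest_contig(void)
{
	return malloc_heap_alloc_biggest(NULL /* type */, 0 /* socket */,
			0 /* flags */, RTE_CACHE_LINE_SIZE, true /* contig */);
}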

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/common/malloc_heap.c | 126 ++++++++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h |   4 ++
 2 files changed, 130 insertions(+)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9305b38..95c4e22 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -149,6 +149,52 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
 }
 
 /*
+ * Iterates through the freelist for a heap to find a free element with the
+ * biggest size and requested alignment. Will also set size to whatever element
+ * size that was found.
+ * Returns null on failure, or pointer to element on success.
+ */
+static struct malloc_elem *
+find_biggest_element(struct malloc_heap *heap, size_t *size,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem, *max_elem = NULL;
+	size_t idx, max_size = 0;
+
+	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
+		for (elem = LIST_FIRST(&heap->free_head[idx]);
+				!!elem; elem = LIST_NEXT(elem, free_list)) {
+			size_t cur_size;
+			if (!check_hugepage_sz(flags, elem->msl->page_sz))
+				continue;
+			if (contig) {
+				cur_size =
+					malloc_elem_find_max_iova_contig(elem,
+							align);
+			} else {
+				void *data_start = RTE_PTR_ADD(elem,
+						MALLOC_ELEM_HEADER_LEN);
+				void *data_end = RTE_PTR_ADD(elem, elem->size -
+						MALLOC_ELEM_TRAILER_LEN);
+				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
+						align);
+				/* check if aligned data start is beyond end */
+				if (aligned >= data_end)
+					continue;
+				cur_size = RTE_PTR_DIFF(data_end, aligned);
+			}
+			if (cur_size > max_size) {
+				max_size = cur_size;
+				max_elem = elem;
+			}
+		}
+	}
+
+	*size = max_size;
+	return max_elem;
+}
+
+/*
  * Main function to allocate a block of memory from the heap.
  * It locks the free list, scans it, and adds a new memseg if the
  * scan fails. Once the new memseg is added, it re-scans and should return
@@ -174,6 +220,26 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
 	return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
+static void *
+heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem;
+	size_t size;
+
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+
+	elem = find_biggest_element(heap, &size, flags, align, contig);
+	if (elem != NULL) {
+		elem = malloc_elem_alloc(elem, size, align, 0, contig);
+
+		/* increase heap's count of allocated elements */
+		heap->alloc_count++;
+	}
+
+	return elem == NULL ? NULL : (void *)(&elem[1]);
+}
+
 /* this function is exposed in malloc_mp.h */
 void
 rollback_expand_heap(struct rte_memseg **ms, int n_segs,
@@ -575,6 +641,66 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
 	return NULL;
 }
 
+static void *
+heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+	void *ret;
+
+	rte_spinlock_lock(&(heap->lock));
+
+	align = align == 0 ? 1 : align;
+
+	ret = heap_alloc_biggest(heap, type, flags, align, contig);
+
+	rte_spinlock_unlock(&(heap->lock));
+
+	return ret;
+}
+
+void *
+malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
+		size_t align, bool contig)
+{
+	int socket, i, cur_socket;
+	void *ret;
+
+	/* return NULL if align is not power-of-2 */
+	if ((align && !rte_is_power_of_2(align)))
+		return NULL;
+
+	if (!rte_eal_has_hugepages())
+		socket_arg = SOCKET_ID_ANY;
+
+	if (socket_arg == SOCKET_ID_ANY)
+		socket = malloc_get_numa_socket();
+	else
+		socket = socket_arg;
+
+	/* Check socket parameter */
+	if (socket >= RTE_MAX_NUMA_NODES)
+		return NULL;
+
+	ret = heap_alloc_biggest_on_socket(type, socket, flags, align,
+			contig);
+	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+		return ret;
+
+	/* try other heaps */
+	for (i = 0; i < (int) rte_socket_count(); i++) {
+		cur_socket = rte_socket_id_by_idx(i);
+		if (cur_socket == socket)
+			continue;
+		ret = heap_alloc_biggest_on_socket(type, cur_socket, flags,
+				align, contig);
+		if (ret != NULL)
+			return ret;
+	}
+	return NULL;
+}
+
 /* this function is exposed in malloc_mp.h */
 int
 malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 03b8014..f52cb55 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -29,6 +29,10 @@ void *
 malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
 		size_t align, size_t bound, bool contig);
 
+void *
+malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig);
+
 int
 malloc_heap_free(struct malloc_elem *elem);
 
-- 
2.7.4


* [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
                     ` (2 preceding siblings ...)
  2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element Anatoly Burakov
@ 2018-05-03 17:18   ` Anatoly Burakov
  2018-05-11 10:25     ` Remy Horton
  3 siblings, 1 reply; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-03 17:18 UTC
  To: dev; +Cc: sergio.gonzalez.monroy

Currently, reserving zero-length memzones is done by looking at
malloc statistics and reserving the biggest-sized element found in
those statistics. This has two issues.

First, there is a race condition. The heap is unlocked between the
time we check the stats and the time we reserve a malloc element for
the memzone. This may make it impossible to reserve the memzone we
wanted, because another allocation might have taken place in the
meantime and the biggest-sized element may no longer be available.
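
To make the race concrete, the old flow is roughly the following
(simplified from eal_common_memzone.c, not verbatim source):

	/* heap lock is NOT held across these two steps */
	malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
	len = stats.greatest_free_size - MALLOC_ELEM_OVERHEAD - align;
	/* another thread may allocate here and shrink the biggest element */
	mz_addr = malloc_heap_alloc(NULL, len, socket_id, flags,
			align, bound, contig);	/* may now fail */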

Second, the size returned by malloc statistics does not include any
alignment information. This is worked around by being conservative
and subtracting the alignment length from the final result, which
leads to fragmentation and to reserving memzones that could have
been bigger but aren't.

Fix all of this by using the earlier-introduced operation to reserve
the biggest possible malloc element. This, however, comes with a
trade-off, because we can only lock one heap at a time. So, if we
check the first available heap and find *any* element at all, that
element will be considered "the biggest", even though other heaps
might have bigger elements. We cannot know what other heaps have
before we try to allocate from them, and it is not a good idea to
lock all of the heaps at the same time, so we will just document
this limitation and encourage users to reserve memzones with the
socket id properly set.

Also, fix up the unit tests to account for the new behavior.

Fixes: fafcc11985a2 ("mem: rework memzone to be allocated by malloc")
Cc: sergio.gonzalez.monroy@intel.com

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/common/eal_common_memzone.c  |  62 ++---------
 lib/librte_eal/common/include/rte_memzone.h |  18 ++++
 test/test/test_memzone.c                    | 157 +++++++++++++++-------------
 3 files changed, 115 insertions(+), 122 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index bce3321..18e2349 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -52,38 +52,6 @@ memzone_lookup_thread_unsafe(const char *name)
 	return NULL;
 }
 
-
-/* This function will return the greatest free block if a heap has been
- * specified. If no heap has been specified, it will return the heap and
- * length of the greatest free block available in all heaps */
-static size_t
-find_heap_max_free_elem(int *s, unsigned align)
-{
-	struct rte_mem_config *mcfg;
-	struct rte_malloc_socket_stats stats;
-	int i, socket = *s;
-	size_t len = 0;
-
-	/* get pointer to global configuration */
-	mcfg = rte_eal_get_configuration()->mem_config;
-
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		if ((socket != SOCKET_ID_ANY) && (socket != i))
-			continue;
-
-		malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
-		if (stats.greatest_free_size > len) {
-			len = stats.greatest_free_size;
-			*s = i;
-		}
-	}
-
-	if (len < MALLOC_ELEM_OVERHEAD + align)
-		return 0;
-
-	return len - MALLOC_ELEM_OVERHEAD - align;
-}
-
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		int socket_id, unsigned int flags, unsigned int align,
@@ -92,6 +60,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	struct rte_memzone *mz;
 	struct rte_mem_config *mcfg;
 	struct rte_fbarray *arr;
+	void *mz_addr;
 	size_t requested_len;
 	int mz_idx;
 	bool contig;
@@ -165,27 +134,16 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	/* malloc only cares about size flags, remove contig flag from flags */
 	flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
-	if (len == 0) {
-		/* len == 0 is only allowed for non-contiguous zones */
-		if (contig) {
-			RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
-			rte_errno = EINVAL;
-			return NULL;
-		}
-		if (bound != 0)
+	if (len == 0 && bound == 0) {
+		mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags,
+				align, contig);
+	} else {
+		if (len == 0)
 			requested_len = bound;
-		else {
-			requested_len = find_heap_max_free_elem(&socket_id, align);
-			if (requested_len == 0) {
-				rte_errno = ENOMEM;
-				return NULL;
-			}
-		}
+		/* allocate memory on heap */
+		mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id,
+				flags, align, bound, contig);
 	}
-
-	/* allocate memory on heap */
-	void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
-			align, bound, contig);
 	if (mz_addr == NULL) {
 		rte_errno = ENOMEM;
 		return NULL;
@@ -213,7 +171,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	snprintf(mz->name, sizeof(mz->name), "%s", name);
 	mz->iova = rte_malloc_virt2iova(mz_addr);
 	mz->addr = mz_addr;
-	mz->len = (requested_len == 0 ? elem->size : requested_len);
+	mz->len = elem->size - MALLOC_ELEM_OVERHEAD;
 	mz->hugepage_sz = elem->msl->page_sz;
 	mz->socket_id = elem->msl->socket_id;
 	mz->flags = 0;
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index 0eeb94f..beb591e 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -77,6 +77,12 @@ struct rte_memzone {
  * correctly filled memzone descriptor. If the allocation cannot be
  * done, return NULL.
  *
+ * @note: It is preferable to reserve memzones with len set to 0 and a valid
+ *   socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but will likely
+ *   not yield expected results. Specifically, the resulting memzone may not
+ *   necessarily be the biggest memzone, but rather biggest memzone on socket id
+ *   corresponding to an lcore from which reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
@@ -130,6 +136,12 @@ const struct rte_memzone *rte_memzone_reserve(const char *name,
  * descriptor. If the allocation cannot be done or if the alignment
  * is not a power of 2, returns NULL.
  *
+ * @note: It is preferable to reserve memzones with len set to 0 and a valid
+ *   socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but will likely
+ *   not yield expected results. Specifically, the resulting memzone may not
+ *   necessarily be the biggest memzone, but rather biggest memzone on socket id
+ *   corresponding to an lcore from which reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
@@ -188,6 +200,12 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name,
  * boundary. That implies that requested length should be less or equal
  * then boundary.
  *
+ * @note: It is preferable to reserve memzones with len set to 0 and a valid
+ *   socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but will likely
+ *   not yield expected results. Specifically, the resulting memzone may not
+ *   necessarily be the biggest memzone, but rather biggest memzone on socket id
+ *   corresponding to an lcore from which reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index efcf732..696a7e9 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -467,23 +467,16 @@ test_memzone_reserve_flags(void)
 
 /* Find the heap with the greatest free block size */
 static size_t
-find_max_block_free_size(const unsigned _align)
+find_max_block_free_size(unsigned align, unsigned int socket_id)
 {
 	struct rte_malloc_socket_stats stats;
-	unsigned i, align = _align;
-	size_t len = 0;
+	size_t len;
 
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		rte_malloc_get_socket_stats(i, &stats);
-		if (stats.greatest_free_size > len)
-			len = stats.greatest_free_size;
-	}
+	rte_malloc_get_socket_stats(socket_id, &stats);
 
-	if (align < RTE_CACHE_LINE_SIZE)
-		align = RTE_CACHE_LINE_ROUNDUP(align+1);
+	len = stats.greatest_free_size;
 
-	if (len <= MALLOC_ELEM_OVERHEAD + align)
-		return 0;
+	align = RTE_CACHE_LINE_ROUNDUP(align);
 
 	return len - MALLOC_ELEM_OVERHEAD - align;
 }
@@ -491,37 +484,44 @@ find_max_block_free_size(const unsigned _align)
 static int
 test_memzone_reserve_max(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen;
+	unsigned int i;
 
-	maxlen = find_max_block_free_size(0);
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen;
+		int socket;
 
-	if (maxlen == 0) {
-		printf("There is no space left!\n");
-		return 0;
-	}
+		socket = rte_socket_id_by_idx(i);
+		maxlen = find_max_block_free_size(0, socket);
 
-	mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
-			SOCKET_ID_ANY, 0);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (maxlen == 0) {
+			printf("There is no space left!\n");
+			return 0;
+		}
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size did not return bigest block\n");
-		printf("Expected size = %zu, actual size = %zu\n", maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
+				socket, 0);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
+		if (mz->len != maxlen) {
+			printf("Memzone reserve with 0 size did not return biggest block\n");
+			printf("Expected size = %zu, actual size = %zu\n",
+					maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
 	}
 
 	return 0;
@@ -530,45 +530,62 @@ test_memzone_reserve_max(void)
 static int
 test_memzone_reserve_max_aligned(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen = 0;
+	unsigned int i;
 
-	/* random alignment */
-	rte_srand((unsigned)rte_rdtsc());
-	const unsigned align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t max_len, min_len = 0;
+		int socket;
 
-	maxlen = find_max_block_free_size(align);
+		socket = rte_socket_id_by_idx(i);
 
-	if (maxlen == 0) {
-		printf("There is no space left for biggest %u-aligned memzone!\n", align);
-		return 0;
-	}
+		/* random alignment */
+		rte_srand((unsigned)rte_rdtsc());
+		const unsigned int align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
 
-	mz = rte_memzone_reserve_aligned(TEST_MEMZONE_NAME("max_zone_aligned"),
-			0, SOCKET_ID_ANY, 0, align);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		/* memzone size may be between size and size - align */
+		min_len = find_max_block_free_size(align, socket);
+		max_len = find_max_block_free_size(0, socket);
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size and alignment %u did not return"
-				" bigest block\n", align);
-		printf("Expected size = %zu, actual size = %zu\n",
-				maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (min_len == 0 || max_len == 0) {
+			printf("There is no space left for biggest %u-aligned memzone!\n",
+					align);
+			return 0;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
-	}
+		mz = rte_memzone_reserve_aligned(
+				TEST_MEMZONE_NAME("max_zone_aligned"),
+				0, socket, 0, align);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+		if (mz->addr != RTE_PTR_ALIGN(mz->addr, align)) {
+			printf("Memzone reserve with 0 size and alignment %u did not return aligned block\n",
+					align);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (mz->len < min_len || mz->len > max_len) {
+			printf("Memzone reserve with 0 size and alignment %u did not return"
+					" biggest block\n", align);
+			printf("Expected size = %zu-%zu, actual size = %zu\n",
+					min_len, max_len, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
+	}
 	return 0;
 }
 
-- 
2.7.4


* Re: [dpdk-dev] [PATCH v3 1/3] malloc: add biggest free IOVA-contiguous element to stats
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
@ 2018-05-10 13:39     ` Remy Horton
  0 siblings, 0 replies; 32+ messages in thread
From: Remy Horton @ 2018-05-10 13:39 UTC
  To: Anatoly Burakov, dev

Logic looks ok to me.

On 03/05/2018 18:17, Anatoly Burakov wrote:
> Users might be interested in finding out the biggest chunk of
> IOVA-contiguous free space that can be allocated from malloc. Add
> the relevant malloc-internal functions and expose this value
> through the malloc stats calculation call.
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>

Acked-by: Remy Horton <remy.horton@intel.com>


* Re: [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element
  2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element Anatoly Burakov
@ 2018-05-10 13:57     ` Remy Horton
  2018-05-14  8:22       ` Burakov, Anatoly
  0 siblings, 1 reply; 32+ messages in thread
From: Remy Horton @ 2018-05-10 13:57 UTC
  To: Anatoly Burakov, dev


On 03/05/2018 18:18, Anatoly Burakov wrote:
[..]
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>

[..]
> +	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
> +		for (elem = LIST_FIRST(&heap->free_head[idx]);
> +				!!elem; elem = LIST_NEXT(elem, free_list)) {

Why the double-negation?

Otherwise, I see no issues.

Acked-by: Remy Horton <remy.horton@intel.com>


* Re: [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
@ 2018-05-11 10:25     ` Remy Horton
  2018-05-14  8:21       ` Burakov, Anatoly
  0 siblings, 1 reply; 32+ messages in thread
From: Remy Horton @ 2018-05-11 10:25 UTC
  To: Anatoly Burakov, dev; +Cc: sergio.gonzalez.monroy


On 03/05/2018 18:18, Anatoly Burakov wrote:
[..]
> Also, fix up the unit tests to account for the new behavior.

Tried running the tests, but they fail on a boundary check:

RTE>>memzone_autotest
test basic memzone API
[...]
1GB Huge pages available
test alignment for memzone_reserve
check alignments and lengths
check overlapping
test boundary alignment for memzone_reserve
check_memzone_bounded(MZ_TEST_bounded_128): invalid memzone boundary 128 crossed
Test Failed
RTE>>


* Re: [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-05-11 10:25     ` Remy Horton
@ 2018-05-14  8:21       ` Burakov, Anatoly
  2018-05-14 11:29         ` Burakov, Anatoly
  0 siblings, 1 reply; 32+ messages in thread
From: Burakov, Anatoly @ 2018-05-14  8:21 UTC
  To: Remy Horton, dev; +Cc: sergio.gonzalez.monroy

On 11-May-18 11:25 AM, Remy Horton wrote:
> 
> On 03/05/2018 18:18, Anatoly Burakov wrote:
> [..]
>> Also, fix up the unit tests to account for the new behavior.
> 
> Tried running the tests, but they fail on a boundary check:
> 
> RTE>>memzone_autotest
> test basic memzone API
> [...]
> 1GB Huge pages available
> test alignment for memzone_reserve
> check alignments and lengths
> check overlapping
> test boundary alignment for memzone_reserve
> check_memzone_bounded(MZ_TEST_bounded_128): invalid memzone boundary 128 crossed
> Test Failed
> RTE>>
> 

Hi Remy,

Passes here, but I'll look into it. Thanks for the report.

-- 
Thanks,
Anatoly


* Re: [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element
  2018-05-10 13:57     ` Remy Horton
@ 2018-05-14  8:22       ` Burakov, Anatoly
  0 siblings, 0 replies; 32+ messages in thread
From: Burakov, Anatoly @ 2018-05-14  8:22 UTC
  To: Remy Horton, dev

On 10-May-18 2:57 PM, Remy Horton wrote:
> 
> On 03/05/2018 18:18, Anatoly Burakov wrote:
> [..]
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> 
> [..]
>> +    for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
>> +        for (elem = LIST_FIRST(&heap->free_head[idx]);
>> +                !!elem; elem = LIST_NEXT(elem, free_list)) {
> 
> Why the double-negation?
> 
> Otherwise, see no issues.
> 
> Acked-by: Remy Horton <remy.horton@intel.com>
> 
> 
> 

Presumably to make it a boolean value, not a pointer. This was
written before my time, so I didn't change it :)

-- 
Thanks,
Anatoly


* [dpdk-dev] [PATCH v4 0/3] Improve zero-length memzone allocation
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
@ 2018-05-14 11:19     ` Anatoly Burakov
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
                         ` (3 more replies)
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
                       ` (2 subsequent siblings)
  3 siblings, 4 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 11:19 UTC
  To: dev; +Cc: remy.horton

This patchset does two things. First, it enables reserving
zero-length memzones that are IOVA-contiguous. Second, it
fixes a long-standing race condition in reserving zero-length
memzones, where the malloc heap was not locked between stats
collection and reservation; instead, the biggest element is
now allocated on the spot.

Some limitations are added, but they are a trade-off between
avoiding race conditions and user convenience. It would be
possible to lock all heaps during a zero-length memzone
reserve, and that would keep the old behavior, but given how
long such an allocation (especially when asking for
IOVA-contiguous memory) may take, a design decision was made
to keep things simple, and only check other heaps if the
current one is completely busy.

Ideas on improvement are welcome.

v4:
- Fixes in memzone test
- Account for element padding
- Fix for wrong memzone size being returned
- Documentation fixes

Anatoly Burakov (3):
  malloc: add biggest free IOVA-contiguous element to stats
  malloc: allow reserving biggest element
  memzone: improve zero-length memzone reserve

 lib/librte_eal/common/eal_common_memzone.c  |  68 +++---------
 lib/librte_eal/common/include/rte_malloc.h  |   1 +
 lib/librte_eal/common/include/rte_memzone.h |  21 ++++
 lib/librte_eal/common/malloc_elem.c         |  79 +++++++++++++
 lib/librte_eal/common/malloc_elem.h         |   6 +
 lib/librte_eal/common/malloc_heap.c         | 137 +++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h         |   4 +
 lib/librte_eal/common/rte_malloc.c          |   2 +
 test/test/test_memzone.c                    | 165 ++++++++++++++++------------
 9 files changed, 358 insertions(+), 125 deletions(-)

-- 
2.7.4


* [dpdk-dev] [PATCH v4 1/3] malloc: add biggest free IOVA-contiguous element to stats
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
@ 2018-05-14 11:19     ` Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 2/3] malloc: allow reserving biggest element Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 11:19 UTC (permalink / raw)
  To: dev; +Cc: remy.horton

Users might be interested in finding out the biggest chunk of
IOVA-contiguous free space that can be allocated from malloc. Add
the relevant malloc-internal functions and expose this through the
malloc stats calculation call.
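
For illustration, a minimal sketch (not part of this patch) of how an
application could read the new statistic through the existing stats API:

	#include <stdio.h>
	#include <rte_malloc.h>

	struct rte_malloc_socket_stats stats;

	/* the new field is filled in by the same call that fills the
	 * other per-socket counters.
	 */
	if (rte_malloc_get_socket_stats(0, &stats) == 0)
		printf("Biggest IOVA-contiguous free block: %zu\n",
				stats.greatest_free_iova_contig_size);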

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---

Notes:
    v4:
    - Fix comments to be more up to date with v4 code
    - Add comments explaining trailer handling
    
    v2:
    - Add header to newly recalculated element start

 lib/librte_eal/common/include/rte_malloc.h |  1 +
 lib/librte_eal/common/malloc_elem.c        | 79 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_elem.h        |  6 +++
 lib/librte_eal/common/malloc_heap.c        | 11 +++++
 lib/librte_eal/common/rte_malloc.c         |  2 +
 5 files changed, 99 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e4..2553201 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -27,6 +27,7 @@ struct rte_malloc_socket_stats {
 	size_t heap_totalsz_bytes; /**< Total bytes on heap */
 	size_t heap_freesz_bytes;  /**< Total free bytes on heap */
 	size_t greatest_free_size; /**< Size in bytes of largest free block */
+	size_t greatest_free_iova_contig_size; /**< Size in bytes of largest IOVA-contiguous block */
 	unsigned free_count;       /**< Number of free elements on heap */
 	unsigned alloc_count;      /**< Number of allocated elements on heap */
 	size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 9bfe9b9..f1bb4fe 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,10 +18,89 @@
 #include <rte_common.h>
 #include <rte_spinlock.h>
 
+#include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
+{
+	void *cur_page, *contig_seg_start, *page_end, *cur_seg_end;
+	void *data_start, *data_end;
+	rte_iova_t expected_iova;
+	struct rte_memseg *ms;
+	size_t page_sz, cur, max;
+
+	page_sz = (size_t)elem->msl->page_sz;
+	data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+	data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN);
+	/* segment must start after header and with specified alignment */
+	contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align);
+
+	/* if we're in IOVA as VA mode, or if we're in legacy mode with
+	 * hugepages, all elements are IOVA-contiguous.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA ||
+			(internal_config.legacy_mem && rte_eal_has_hugepages()))
+		return RTE_PTR_DIFF(data_end, contig_seg_start);
+
+	cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz);
+	ms = rte_mem_virt2memseg(cur_page, elem->msl);
+
+	/* do first iteration outside the loop */
+	page_end = RTE_PTR_ADD(cur_page, page_sz);
+	cur_seg_end = RTE_MIN(page_end, data_end);
+	cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) -
+			MALLOC_ELEM_TRAILER_LEN;
+	max = cur;
+	expected_iova = ms->iova + page_sz;
+	/* memsegs are contiguous in memory */
+	ms++;
+
+	cur_page = RTE_PTR_ADD(cur_page, page_sz);
+
+	while (cur_page < data_end) {
+		page_end = RTE_PTR_ADD(cur_page, page_sz);
+		cur_seg_end = RTE_MIN(page_end, data_end);
+
+		/* reset start of contiguous segment if unexpected iova */
+		if (ms->iova != expected_iova) {
+			/* next contiguous segment must start at specified
+			 * alignment.
+			 */
+			contig_seg_start = RTE_PTR_ALIGN(cur_page, align);
+			/* new segment start may be on a different page, so find
+			 * the page and skip to next iteration to make sure
+			 * we're not blowing past data end.
+			 */
+			ms = rte_mem_virt2memseg(contig_seg_start, elem->msl);
+			cur_page = ms->addr;
+			/* don't trigger another recalculation */
+			expected_iova = ms->iova;
+			continue;
+		}
+		/* cur_seg_end ends on a page boundary or on data end. if we're
+		 * looking at data end, then malloc trailer is already included
+		 * in the calculations. if we're looking at page end, then we
+		 * know there's more data past this page and thus there's space
+		 * for malloc element trailer, so don't count it here.
+		 */
+		cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+		/* update max if cur value is bigger */
+		if (cur > max)
+			max = cur;
+
+		/* move to next page */
+		cur_page = page_end;
+		expected_iova = ms->iova + page_sz;
+		/* memsegs are contiguous in memory */
+		ms++;
+	}
+
+	return max;
+}
+
 /*
  * Initialize a general malloc_elem header structure
  */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 7331af9..e2bda4c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -179,4 +179,10 @@ malloc_elem_free_list_index(size_t size);
 void
 malloc_elem_free_list_insert(struct malloc_elem *elem);
 
+/*
+ * Find biggest IOVA-contiguous zone within an element with specified alignment.
+ */
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align);
+
 #endif /* MALLOC_ELEM_H_ */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index d6cf3af..9305b38 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -803,16 +803,27 @@ malloc_heap_get_stats(struct malloc_heap *heap,
 	socket_stats->free_count = 0;
 	socket_stats->heap_freesz_bytes = 0;
 	socket_stats->greatest_free_size = 0;
+	socket_stats->greatest_free_iova_contig_size = 0;
 
 	/* Iterate through free list */
 	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
 		for (elem = LIST_FIRST(&heap->free_head[idx]);
 			!!elem; elem = LIST_NEXT(elem, free_list))
 		{
+			size_t iova_contig_sz;
 			socket_stats->free_count++;
 			socket_stats->heap_freesz_bytes += elem->size;
 			if (elem->size > socket_stats->greatest_free_size)
 				socket_stats->greatest_free_size = elem->size;
+			iova_contig_sz =
+					malloc_elem_find_max_iova_contig(elem,
+							RTE_CACHE_LINE_SIZE);
+			/* find_max_iova_contig doesn't include overhead */
+			iova_contig_sz += MALLOC_ELEM_OVERHEAD;
+			if (iova_contig_sz >
+					socket_stats->greatest_free_iova_contig_size)
+				socket_stats->greatest_free_iova_contig_size =
+						iova_contig_sz;
 		}
 	}
 	/* Get stats on overall heap and allocated memory on this heap */
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d1..b4e87a3 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -195,6 +195,8 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
 		fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
 		fprintf(f, "\tGreatest_free_size:%zu,\n",
 				sock_stats.greatest_free_size);
+		fprintf(f, "\tGreatest_free_iova_contig_size:%zu,\n",
+				sock_stats.greatest_free_iova_contig_size);
 		fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
 		fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
 	}
-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v4 2/3] malloc: allow reserving biggest element
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
@ 2018-05-14 11:19     ` Anatoly Burakov
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 11:19 UTC (permalink / raw)
  To: dev; +Cc: remy.horton

Add an internal-only function to allocate the biggest element from
the heap. Nominally, it supports SOCKET_ID_ANY as its socket
argument, but that is of limited use, because other sockets are
only allocated from if the entire heap on the current or
specified socket is busy.

Still, being able to reserve the biggest element makes it possible
to fix a race condition in memzone reserve that has been there for
a long time.
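
As a sketch of the intended internal-only usage, using the names
introduced by this patch:

	/* try to take the biggest free element on the caller's NUMA
	 * socket; other sockets are only tried if this heap is
	 * completely busy.
	 */
	void *p = malloc_heap_alloc_biggest(NULL, SOCKET_ID_ANY, 0,
			RTE_CACHE_LINE_SIZE, false);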

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---
 lib/librte_eal/common/malloc_heap.c | 126 ++++++++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h |   4 ++
 2 files changed, 130 insertions(+)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9305b38..95c4e22 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -149,6 +149,52 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
 }
 
 /*
+ * Iterates through the freelist for a heap to find a free element with the
+ * biggest size and requested alignment. Will also set size to whatever element
+ * size that was found.
+ * Returns null on failure, or pointer to element on success.
+ */
+static struct malloc_elem *
+find_biggest_element(struct malloc_heap *heap, size_t *size,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem, *max_elem = NULL;
+	size_t idx, max_size = 0;
+
+	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
+		for (elem = LIST_FIRST(&heap->free_head[idx]);
+				!!elem; elem = LIST_NEXT(elem, free_list)) {
+			size_t cur_size;
+			if (!check_hugepage_sz(flags, elem->msl->page_sz))
+				continue;
+			if (contig) {
+				cur_size =
+					malloc_elem_find_max_iova_contig(elem,
+							align);
+			} else {
+				void *data_start = RTE_PTR_ADD(elem,
+						MALLOC_ELEM_HEADER_LEN);
+				void *data_end = RTE_PTR_ADD(elem, elem->size -
+						MALLOC_ELEM_TRAILER_LEN);
+				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
+						align);
+				/* check if aligned data start is beyond end */
+				if (aligned >= data_end)
+					continue;
+				cur_size = RTE_PTR_DIFF(data_end, aligned);
+			}
+			if (cur_size > max_size) {
+				max_size = cur_size;
+				max_elem = elem;
+			}
+		}
+	}
+
+	*size = max_size;
+	return max_elem;
+}
+
+/*
  * Main function to allocate a block of memory from the heap.
  * It locks the free list, scans it, and adds a new memseg if the
  * scan fails. Once the new memseg is added, it re-scans and should return
@@ -174,6 +220,26 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
 	return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
+static void *
+heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem;
+	size_t size;
+
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+
+	elem = find_biggest_element(heap, &size, flags, align, contig);
+	if (elem != NULL) {
+		elem = malloc_elem_alloc(elem, size, align, 0, contig);
+
+		/* increase heap's count of allocated elements */
+		heap->alloc_count++;
+	}
+
+	return elem == NULL ? NULL : (void *)(&elem[1]);
+}
+
 /* this function is exposed in malloc_mp.h */
 void
 rollback_expand_heap(struct rte_memseg **ms, int n_segs,
@@ -575,6 +641,66 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
 	return NULL;
 }
 
+static void *
+heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+	void *ret;
+
+	rte_spinlock_lock(&(heap->lock));
+
+	align = align == 0 ? 1 : align;
+
+	ret = heap_alloc_biggest(heap, type, flags, align, contig);
+
+	rte_spinlock_unlock(&(heap->lock));
+
+	return ret;
+}
+
+void *
+malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
+		size_t align, bool contig)
+{
+	int socket, i, cur_socket;
+	void *ret;
+
+	/* return NULL if align is not power-of-2 */
+	if ((align && !rte_is_power_of_2(align)))
+		return NULL;
+
+	if (!rte_eal_has_hugepages())
+		socket_arg = SOCKET_ID_ANY;
+
+	if (socket_arg == SOCKET_ID_ANY)
+		socket = malloc_get_numa_socket();
+	else
+		socket = socket_arg;
+
+	/* Check socket parameter */
+	if (socket >= RTE_MAX_NUMA_NODES)
+		return NULL;
+
+	ret = heap_alloc_biggest_on_socket(type, socket, flags, align,
+			contig);
+	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+		return ret;
+
+	/* try other heaps */
+	for (i = 0; i < (int) rte_socket_count(); i++) {
+		cur_socket = rte_socket_id_by_idx(i);
+		if (cur_socket == socket)
+			continue;
+		ret = heap_alloc_biggest_on_socket(type, cur_socket, flags,
+				align, contig);
+		if (ret != NULL)
+			return ret;
+	}
+	return NULL;
+}
+
 /* this function is exposed in malloc_mp.h */
 int
 malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 03b8014..f52cb55 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -29,6 +29,10 @@ void *
 malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
 		size_t align, size_t bound, bool contig);
 
+void *
+malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig);
+
 int
 malloc_heap_free(struct malloc_elem *elem);
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v4 3/3] memzone: improve zero-length memzone reserve
  2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
                       ` (2 preceding siblings ...)
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 2/3] malloc: allow reserving biggest element Anatoly Burakov
@ 2018-05-14 11:19     ` Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 11:19 UTC (permalink / raw)
  To: dev; +Cc: remy.horton, sergio.gonzalez.monroy

Currently, reserving zero-length memzones is done by looking at
malloc statistics and reserving the biggest element found in those
statistics. This has two issues.

First, there is a race condition. The heap is unlocked between the
time we check the stats and the time we reserve a malloc element for
the memzone. This may make it impossible to reserve the memzone we
wanted, because another allocation might have taken place in the
meantime and the biggest element may no longer be available.

Second, the size returned by malloc statistics does not include any
alignment information, which is worked around by being conservative and
subtracting alignment length from the final result. This leads to
fragmentation and reserving memzones that could have been bigger but
aren't.

Fix all of this by using the earlier-introduced operation to reserve
the biggest possible malloc element. This, however, comes with a
trade-off, because we can only lock one heap at a time. So, if we
check the first available heap and find *any* element at all, that
element will be considered "the biggest", even though other heaps
might have bigger elements. We cannot know what other heaps have
before we try to allocate from them, and it is not a good idea to
lock all of the heaps at the same time, so we simply document this
limitation and encourage users to reserve memzones with the socket
id properly set.

Also, fix up the unit tests to account for the new behavior.
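
To illustrate the recommendation above, a short usage sketch (not from
the patch): reserving a zero-length memzone with a valid socket id
avoids the SOCKET_ID_ANY caveat:

	const struct rte_memzone *mz;

	/* len == 0 and bound == 0 takes the new biggest-element path,
	 * reserving the biggest zone available on the given socket.
	 */
	mz = rte_memzone_reserve("big_zone", 0, rte_socket_id(), 0);
	if (mz != NULL)
		printf("got %zu bytes on socket %i\n", mz->len,
				mz->socket_id);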

Fixes: fafcc11985a2 ("mem: rework memzone to be allocated by malloc")
Cc: sergio.gonzalez.monroy@intel.com

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v4:
    - Rebased on latest master
    - Improved documentation
    - Added accounting for element pad [1]
    - Fixed max len underflow in test
    - Checkpatch fixes
    
    [1] A patch has recently fixed a similar issue:
    
    https://dpdk.org/dev/patchwork/patch/39332/
    
    The accounting for padding is also needed because the size of the
    element may include not only the malloc header overhead, but also
    padding, if the element has any.
    
    At first glance, it would seem like an additional change is needed for
    pre-18.05 code as well. However, on closer inspection, the original code
    was incorrect because it was comparing requested_len to 0, which is never
    zero; it is always at least the cache line size due to the earlier
    RTE_MAX() call (or rather, it could be zero, but in that case it would
    fail earlier).
    This downgrades the above quoted bug from "potential memory corruption bug"
    to "this bug was never a bug due to another bug".
    
    A proper fix for pre-18.05 would be to remove the check altogether and
    always go by requested_len, which is what we use to reserve memzones
    in the first place. I will submit it separately.

 lib/librte_eal/common/eal_common_memzone.c  |  68 +++---------
 lib/librte_eal/common/include/rte_memzone.h |  21 ++++
 test/test/test_memzone.c                    | 165 ++++++++++++++++------------
 3 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index faa3b06..52f1501 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -52,38 +52,6 @@ memzone_lookup_thread_unsafe(const char *name)
 	return NULL;
 }
 
-
-/* This function will return the greatest free block if a heap has been
- * specified. If no heap has been specified, it will return the heap and
- * length of the greatest free block available in all heaps */
-static size_t
-find_heap_max_free_elem(int *s, unsigned align)
-{
-	struct rte_mem_config *mcfg;
-	struct rte_malloc_socket_stats stats;
-	int i, socket = *s;
-	size_t len = 0;
-
-	/* get pointer to global configuration */
-	mcfg = rte_eal_get_configuration()->mem_config;
-
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		if ((socket != SOCKET_ID_ANY) && (socket != i))
-			continue;
-
-		malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
-		if (stats.greatest_free_size > len) {
-			len = stats.greatest_free_size;
-			*s = i;
-		}
-	}
-
-	if (len < MALLOC_ELEM_OVERHEAD + align)
-		return 0;
-
-	return len - MALLOC_ELEM_OVERHEAD - align;
-}
-
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		int socket_id, unsigned int flags, unsigned int align,
@@ -92,6 +60,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	struct rte_memzone *mz;
 	struct rte_mem_config *mcfg;
 	struct rte_fbarray *arr;
+	void *mz_addr;
 	size_t requested_len;
 	int mz_idx;
 	bool contig;
@@ -140,8 +109,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		return NULL;
 	}
 
-	len += RTE_CACHE_LINE_MASK;
-	len &= ~((size_t) RTE_CACHE_LINE_MASK);
+	len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE);
 
 	/* save minimal requested  length */
 	requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE,  len);
@@ -165,27 +133,16 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	/* malloc only cares about size flags, remove contig flag from flags */
 	flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
-	if (len == 0) {
-		/* len == 0 is only allowed for non-contiguous zones */
-		if (contig) {
-			RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
-			rte_errno = EINVAL;
-			return NULL;
-		}
-		if (bound != 0)
+	if (len == 0 && bound == 0) {
+		mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags,
+				align, contig);
+	} else {
+		if (len == 0)
 			requested_len = bound;
-		else {
-			requested_len = find_heap_max_free_elem(&socket_id, align);
-			if (requested_len == 0) {
-				rte_errno = ENOMEM;
-				return NULL;
-			}
-		}
+		/* allocate memory on heap */
+		mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id,
+				flags, align, bound, contig);
 	}
-
-	/* allocate memory on heap */
-	void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
-			align, bound, contig);
 	if (mz_addr == NULL) {
 		rte_errno = ENOMEM;
 		return NULL;
@@ -213,8 +170,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	snprintf(mz->name, sizeof(mz->name), "%s", name);
 	mz->iova = rte_malloc_virt2iova(mz_addr);
 	mz->addr = mz_addr;
-	mz->len = (requested_len == 0 ?
-			(elem->size - MALLOC_ELEM_OVERHEAD) : requested_len);
+	mz->len = (len == 0 ?
+			(elem->size - elem->pad - MALLOC_ELEM_OVERHEAD) :
+			requested_len);
 	mz->hugepage_sz = elem->msl->page_sz;
 	mz->socket_id = elem->msl->socket_id;
 	mz->flags = 0;
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index a4c9bd6..f478fa9 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -81,6 +81,13 @@ struct rte_memzone {
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather biggest
+ *   memzone available on socket id corresponding to an lcore from which
+ *   reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
@@ -138,6 +145,13 @@ const struct rte_memzone *rte_memzone_reserve(const char *name,
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather biggest
+ *   memzone available on socket id corresponding to an lcore from which
+ *   reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
@@ -200,6 +214,13 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name,
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather biggest
+ *   memzone available on socket id corresponding to an lcore from which
+ *   reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index efcf732..452d7cc 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -467,61 +467,69 @@ test_memzone_reserve_flags(void)
 
 /* Find the heap with the greatest free block size */
 static size_t
-find_max_block_free_size(const unsigned _align)
+find_max_block_free_size(unsigned int align, unsigned int socket_id)
 {
 	struct rte_malloc_socket_stats stats;
-	unsigned i, align = _align;
-	size_t len = 0;
+	size_t len, overhead;
 
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		rte_malloc_get_socket_stats(i, &stats);
-		if (stats.greatest_free_size > len)
-			len = stats.greatest_free_size;
-	}
+	rte_malloc_get_socket_stats(socket_id, &stats);
+
+	len = stats.greatest_free_size;
+	overhead = MALLOC_ELEM_OVERHEAD;
+
+	if (len == 0)
+		return 0;
 
-	if (align < RTE_CACHE_LINE_SIZE)
-		align = RTE_CACHE_LINE_ROUNDUP(align+1);
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+	overhead += align;
 
-	if (len <= MALLOC_ELEM_OVERHEAD + align)
+	if (len < overhead)
 		return 0;
 
-	return len - MALLOC_ELEM_OVERHEAD - align;
+	return len - overhead;
 }
 
 static int
 test_memzone_reserve_max(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen;
+	unsigned int i;
 
-	maxlen = find_max_block_free_size(0);
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen;
+		int socket;
 
-	if (maxlen == 0) {
-		printf("There is no space left!\n");
-		return 0;
-	}
+		socket = rte_socket_id_by_idx(i);
+		maxlen = find_max_block_free_size(0, socket);
 
-	mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
-			SOCKET_ID_ANY, 0);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (maxlen == 0) {
+			printf("There is no space left!\n");
+			return 0;
+		}
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size did not return bigest block\n");
-		printf("Expected size = %zu, actual size = %zu\n", maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
+				socket, 0);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
+		if (mz->len != maxlen) {
+			printf("Memzone reserve with 0 size did not return bigest block\n");
+			printf("Expected size = %zu, actual size = %zu\n",
+					maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
 	}
 
 	return 0;
@@ -530,45 +538,62 @@ test_memzone_reserve_max(void)
 static int
 test_memzone_reserve_max_aligned(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen = 0;
+	unsigned int i;
 
-	/* random alignment */
-	rte_srand((unsigned)rte_rdtsc());
-	const unsigned align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen, minlen = 0;
+		int socket;
 
-	maxlen = find_max_block_free_size(align);
+		socket = rte_socket_id_by_idx(i);
 
-	if (maxlen == 0) {
-		printf("There is no space left for biggest %u-aligned memzone!\n", align);
-		return 0;
-	}
+		/* random alignment */
+		rte_srand((unsigned int)rte_rdtsc());
+		const unsigned int align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
 
-	mz = rte_memzone_reserve_aligned(TEST_MEMZONE_NAME("max_zone_aligned"),
-			0, SOCKET_ID_ANY, 0, align);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		/* memzone size may be between size and size - align */
+		minlen = find_max_block_free_size(align, socket);
+		maxlen = find_max_block_free_size(0, socket);
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size and alignment %u did not return"
-				" bigest block\n", align);
-		printf("Expected size = %zu, actual size = %zu\n",
-				maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (minlen == 0 || maxlen == 0) {
+			printf("There is no space left for biggest %u-aligned memzone!\n",
+					align);
+			return 0;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
-	}
+		mz = rte_memzone_reserve_aligned(
+				TEST_MEMZONE_NAME("max_zone_aligned"),
+				0, socket, 0, align);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+		if (mz->addr != RTE_PTR_ALIGN(mz->addr, align)) {
+			printf("Memzone reserve with 0 size and alignment %u did not return aligned block\n",
+					align);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
+		if (mz->len < minlen || mz->len > maxlen) {
+			printf("Memzone reserve with 0 size and alignment %u did not return"
+					" bigest block\n", align);
+			printf("Expected size = %zu-%zu, actual size = %zu\n",
+					minlen, maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
+	}
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-05-14  8:21       ` Burakov, Anatoly
@ 2018-05-14 11:29         ` Burakov, Anatoly
  2018-05-14 12:23           ` Burakov, Anatoly
  2018-05-15  6:24           ` Remy Horton
  0 siblings, 2 replies; 32+ messages in thread
From: Burakov, Anatoly @ 2018-05-14 11:29 UTC (permalink / raw)
  To: Remy Horton, dev; +Cc: sergio.gonzalez.monroy

On 14-May-18 9:21 AM, Burakov, Anatoly wrote:
> On 11-May-18 11:25 AM, Remy Horton wrote:
>>
>> On 03/05/2018 18:18, Anatoly Burakov wrote:
>> [..]
>>> Also, fixup unit tests to account for the new behavior.
>>
>> Tried running the tests but it fails on a boundary check:
>>
>> RTE>>memzone_autotest
>> test basic memzone API
>> [...]
>> 1GB Huge pages available
>> test alignment for memzone_reserve
>> check alignments and lengths
>> check overlapping
>> test boundary alignment for memzone_reserve
>> check_memzone_bounded(MZ_TEST_bounded_128): invalid memzone boundary 
>> 128 crossed
>> Test Failed
>> RTE>>
>>
> 
> Hi Remy,
> 
> Passes here, but I'll look into it, thanks for the report.
> 

This failure is not caused by this patchset, and you should get similar
failures on master if you see these while testing my patchset. I am not
able to reproduce this issue, but I'll double-check the bounded reserve
code with a fine-toothed comb anyway.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-05-14 11:29         ` Burakov, Anatoly
@ 2018-05-14 12:23           ` Burakov, Anatoly
  2018-05-15  6:24           ` Remy Horton
  1 sibling, 0 replies; 32+ messages in thread
From: Burakov, Anatoly @ 2018-05-14 12:23 UTC (permalink / raw)
  To: Remy Horton, dev; +Cc: sergio.gonzalez.monroy

On 14-May-18 12:29 PM, Burakov, Anatoly wrote:
> On 14-May-18 9:21 AM, Burakov, Anatoly wrote:
>> On 11-May-18 11:25 AM, Remy Horton wrote:
>>>
>>> On 03/05/2018 18:18, Anatoly Burakov wrote:
>>> [..]
>>>> Also, fixup unit tests to account for the new behavior.
>>>
>>> Tried running the tests but it fails on a boundary check:
>>>
>>> RTE>>memzone_autotest
>>> test basic memzone API
>>> [...]
>>> 1GB Huge pages available
>>> test alignment for memzone_reserve
>>> check alignments and lengths
>>> check overlapping
>>> test boundary alignment for memzone_reserve
>>> check_memzone_bounded(MZ_TEST_bounded_128): invalid memzone boundary 
>>> 128 crossed
>>> Test Failed
>>> RTE>>
>>>
>>
>> Hi Remy,
>>
>> Passes here, but I'll look into it, thanks for the report.
>>
> 
> This failure is not caused by this patchset, and you should get similar
> failures on master if you see these while testing my patchset. I am not
> able to reproduce this issue, but I'll double-check the bounded reserve
> code with a fine-toothed comb anyway.
> 
Further investigation showed that this is indeed related to the new
code. So, v5 has a bug too; I'll fix it for v6.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v5 0/3] Improve zero-length memzone allocation
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
@ 2018-05-14 13:47       ` Anatoly Burakov
  2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 " Anatoly Burakov
                           ` (3 more replies)
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
                         ` (2 subsequent siblings)
  3 siblings, 4 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 13:47 UTC (permalink / raw)
  To: dev; +Cc: remy.horton

This patchset does two things. First, it enables reserving
zero-length memzones that are IOVA-contiguous. Second,
it fixes a long-standing race condition in reserving
zero-length memzones, where the malloc heap is not locked
between stats collection and reservation; instead, the
biggest element is now allocated on the spot.

Some limitations are added, but they are a trade-off between
avoiding race conditions and user convenience. It would be
possible to lock all heaps during a zero-length memzone
reserve, which would keep the old behavior, but since such an
allocation (especially when asking for IOVA-contiguous
memory) may take a long time, a design decision was made to
keep things simple, and only check other heaps if the
current one is completely busy.

Ideas on improvement are welcome.

v5:
- Use bound length if reserving memzone with zero length

v4:
- Fixes in memzone test
- Account for element padding
- Fix for wrong memzone size being returned
- Documentation fixes

Anatoly Burakov (3):
  malloc: add biggest free IOVA-contiguous element to stats
  malloc: allow reserving biggest element
  memzone: improve zero-length memzone reserve

 lib/librte_eal/common/eal_common_memzone.c  |  70 +++---------
 lib/librte_eal/common/include/rte_malloc.h  |   1 +
 lib/librte_eal/common/include/rte_memzone.h |  21 ++++
 lib/librte_eal/common/malloc_elem.c         |  79 +++++++++++++
 lib/librte_eal/common/malloc_elem.h         |   6 +
 lib/librte_eal/common/malloc_heap.c         | 137 +++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h         |   4 +
 lib/librte_eal/common/rte_malloc.c          |   2 +
 test/test/test_memzone.c                    | 165 ++++++++++++++++------------
 9 files changed, 360 insertions(+), 125 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-contiguous element to stats
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
@ 2018-05-14 13:47       ` Anatoly Burakov
  2018-05-14 13:58         ` Shreyansh Jain
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 2/3] malloc: allow reserving biggest element Anatoly Burakov
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 1 reply; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 13:47 UTC (permalink / raw)
  To: dev; +Cc: remy.horton

Users might be interested in finding out the biggest chunk of
IOVA-contiguous free space that can be allocated from malloc. Add
the relevant malloc-internal functions and expose this through the
malloc stats calculation call.
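
For a quick look at the new counter without coding against the struct,
the dump call extended by this patch prints it per socket:

	/* prints Greatest_free_iova_contig_size along with the other
	 * per-socket heap statistics.
	 */
	rte_malloc_dump_stats(stdout, NULL);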

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---

Notes:
    v4:
    - Fix comments to be more up to date with v4 code
    - Add comments explaining trailer handling
    
    v2:
    - Add header to newly recalculated element start

 lib/librte_eal/common/include/rte_malloc.h |  1 +
 lib/librte_eal/common/malloc_elem.c        | 79 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_elem.h        |  6 +++
 lib/librte_eal/common/malloc_heap.c        | 11 +++++
 lib/librte_eal/common/rte_malloc.c         |  2 +
 5 files changed, 99 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h
index a9fb7e4..2553201 100644
--- a/lib/librte_eal/common/include/rte_malloc.h
+++ b/lib/librte_eal/common/include/rte_malloc.h
@@ -27,6 +27,7 @@ struct rte_malloc_socket_stats {
 	size_t heap_totalsz_bytes; /**< Total bytes on heap */
 	size_t heap_freesz_bytes;  /**< Total free bytes on heap */
 	size_t greatest_free_size; /**< Size in bytes of largest free block */
+	size_t greatest_free_iova_contig_size; /**< Size in bytes of largest IOVA-contiguous block */
 	unsigned free_count;       /**< Number of free elements on heap */
 	unsigned alloc_count;      /**< Number of allocated elements on heap */
 	size_t heap_allocsz_bytes; /**< Total allocated bytes on heap */
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 9bfe9b9..f1bb4fe 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,10 +18,89 @@
 #include <rte_common.h>
 #include <rte_spinlock.h>
 
+#include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
+{
+	void *cur_page, *contig_seg_start, *page_end, *cur_seg_end;
+	void *data_start, *data_end;
+	rte_iova_t expected_iova;
+	struct rte_memseg *ms;
+	size_t page_sz, cur, max;
+
+	page_sz = (size_t)elem->msl->page_sz;
+	data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+	data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN);
+	/* segment must start after header and with specified alignment */
+	contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align);
+
+	/* if we're in IOVA as VA mode, or if we're in legacy mode with
+	 * hugepages, all elements are IOVA-contiguous.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA ||
+			(internal_config.legacy_mem && rte_eal_has_hugepages()))
+		return RTE_PTR_DIFF(data_end, contig_seg_start);
+
+	cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz);
+	ms = rte_mem_virt2memseg(cur_page, elem->msl);
+
+	/* do first iteration outside the loop */
+	page_end = RTE_PTR_ADD(cur_page, page_sz);
+	cur_seg_end = RTE_MIN(page_end, data_end);
+	cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) -
+			MALLOC_ELEM_TRAILER_LEN;
+	max = cur;
+	expected_iova = ms->iova + page_sz;
+	/* memsegs are contiguous in memory */
+	ms++;
+
+	cur_page = RTE_PTR_ADD(cur_page, page_sz);
+
+	while (cur_page < data_end) {
+		page_end = RTE_PTR_ADD(cur_page, page_sz);
+		cur_seg_end = RTE_MIN(page_end, data_end);
+
+		/* reset start of contiguous segment if unexpected iova */
+		if (ms->iova != expected_iova) {
+			/* next contiguous segment must start at specified
+			 * alignment.
+			 */
+			contig_seg_start = RTE_PTR_ALIGN(cur_page, align);
+			/* new segment start may be on a different page, so find
+			 * the page and skip to next iteration to make sure
+			 * we're not blowing past data end.
+			 */
+			ms = rte_mem_virt2memseg(contig_seg_start, elem->msl);
+			cur_page = ms->addr;
+			/* don't trigger another recalculation */
+			expected_iova = ms->iova;
+			continue;
+		}
+		/* cur_seg_end ends on a page boundary or on data end. if we're
+		 * looking at data end, then malloc trailer is already included
+		 * in the calculations. if we're looking at page end, then we
+		 * know there's more data past this page and thus there's space
+		 * for malloc element trailer, so don't count it here.
+		 */
+		cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+		/* update max if cur value is bigger */
+		if (cur > max)
+			max = cur;
+
+		/* move to next page */
+		cur_page = page_end;
+		expected_iova = ms->iova + page_sz;
+		/* memsegs are contiguous in memory */
+		ms++;
+	}
+
+	return max;
+}
+
 /*
  * Initialize a general malloc_elem header structure
  */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 7331af9..e2bda4c 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -179,4 +179,10 @@ malloc_elem_free_list_index(size_t size);
 void
 malloc_elem_free_list_insert(struct malloc_elem *elem);
 
+/*
+ * Find biggest IOVA-contiguous zone within an element with specified alignment.
+ */
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align);
+
 #endif /* MALLOC_ELEM_H_ */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index d6cf3af..9305b38 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -803,16 +803,27 @@ malloc_heap_get_stats(struct malloc_heap *heap,
 	socket_stats->free_count = 0;
 	socket_stats->heap_freesz_bytes = 0;
 	socket_stats->greatest_free_size = 0;
+	socket_stats->greatest_free_iova_contig_size = 0;
 
 	/* Iterate through free list */
 	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
 		for (elem = LIST_FIRST(&heap->free_head[idx]);
 			!!elem; elem = LIST_NEXT(elem, free_list))
 		{
+			size_t iova_contig_sz;
 			socket_stats->free_count++;
 			socket_stats->heap_freesz_bytes += elem->size;
 			if (elem->size > socket_stats->greatest_free_size)
 				socket_stats->greatest_free_size = elem->size;
+			iova_contig_sz =
+					malloc_elem_find_max_iova_contig(elem,
+							RTE_CACHE_LINE_SIZE);
+			/* find_max_iova_contig doesn't include overhead */
+			iova_contig_sz += MALLOC_ELEM_OVERHEAD;
+			if (iova_contig_sz >
+					socket_stats->greatest_free_iova_contig_size)
+				socket_stats->greatest_free_iova_contig_size =
+						iova_contig_sz;
 		}
 	}
 	/* Get stats on overall heap and allocated memory on this heap */
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index b51a6d1..b4e87a3 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -195,6 +195,8 @@ rte_malloc_dump_stats(FILE *f, __rte_unused const char *type)
 		fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes);
 		fprintf(f, "\tGreatest_free_size:%zu,\n",
 				sock_stats.greatest_free_size);
+		fprintf(f, "\tGreatest_free_iova_contig_size:%zu,\n",
+				sock_stats.greatest_free_iova_contig_size);
 		fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count);
 		fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count);
 	}
-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v5 2/3] malloc: allow reserving biggest element
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
@ 2018-05-14 13:47       ` Anatoly Burakov
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 13:47 UTC (permalink / raw)
  To: dev; +Cc: remy.horton

Add an internal-only function to allocate the biggest element from
the heap. Nominally, it supports SOCKET_ID_ANY as its socket
argument, but that is of limited use, because other sockets are
only allocated from if the entire heap on the current or
specified socket is busy.

Still, being able to reserve the biggest element makes it possible
to fix a race condition in memzone reserve that has been there for
a long time.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---
 lib/librte_eal/common/malloc_heap.c | 126 ++++++++++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h |   4 ++
 2 files changed, 130 insertions(+)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 9305b38..95c4e22 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -149,6 +149,52 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
 }
 
 /*
+ * Iterates through the freelist for a heap to find a free element with the
+ * biggest size and requested alignment. Will also set size to whatever element
+ * size that was found.
+ * Returns null on failure, or pointer to element on success.
+ */
+static struct malloc_elem *
+find_biggest_element(struct malloc_heap *heap, size_t *size,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem, *max_elem = NULL;
+	size_t idx, max_size = 0;
+
+	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
+		for (elem = LIST_FIRST(&heap->free_head[idx]);
+				!!elem; elem = LIST_NEXT(elem, free_list)) {
+			size_t cur_size;
+			if (!check_hugepage_sz(flags, elem->msl->page_sz))
+				continue;
+			if (contig) {
+				cur_size =
+					malloc_elem_find_max_iova_contig(elem,
+							align);
+			} else {
+				void *data_start = RTE_PTR_ADD(elem,
+						MALLOC_ELEM_HEADER_LEN);
+				void *data_end = RTE_PTR_ADD(elem, elem->size -
+						MALLOC_ELEM_TRAILER_LEN);
+				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
+						align);
+				/* check if aligned data start is beyond end */
+				if (aligned >= data_end)
+					continue;
+				cur_size = RTE_PTR_DIFF(data_end, aligned);
+			}
+			if (cur_size > max_size) {
+				max_size = cur_size;
+				max_elem = elem;
+			}
+		}
+	}
+
+	*size = max_size;
+	return max_elem;
+}
+
+/*
  * Main function to allocate a block of memory from the heap.
  * It locks the free list, scans it, and adds a new memseg if the
  * scan fails. Once the new memseg is added, it re-scans and should return
@@ -174,6 +220,26 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
 	return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
+static void *
+heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem;
+	size_t size;
+
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+
+	elem = find_biggest_element(heap, &size, flags, align, contig);
+	if (elem != NULL) {
+		elem = malloc_elem_alloc(elem, size, align, 0, contig);
+
+		/* increase heap's count of allocated elements */
+		heap->alloc_count++;
+	}
+
+	return elem == NULL ? NULL : (void *)(&elem[1]);
+}
+
 /* this function is exposed in malloc_mp.h */
 void
 rollback_expand_heap(struct rte_memseg **ms, int n_segs,
@@ -575,6 +641,66 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
 	return NULL;
 }
 
+static void *
+heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+	void *ret;
+
+	rte_spinlock_lock(&(heap->lock));
+
+	align = align == 0 ? 1 : align;
+
+	ret = heap_alloc_biggest(heap, type, flags, align, contig);
+
+	rte_spinlock_unlock(&(heap->lock));
+
+	return ret;
+}
+
+void *
+malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
+		size_t align, bool contig)
+{
+	int socket, i, cur_socket;
+	void *ret;
+
+	/* return NULL if align is not power-of-2 */
+	if ((align && !rte_is_power_of_2(align)))
+		return NULL;
+
+	if (!rte_eal_has_hugepages())
+		socket_arg = SOCKET_ID_ANY;
+
+	if (socket_arg == SOCKET_ID_ANY)
+		socket = malloc_get_numa_socket();
+	else
+		socket = socket_arg;
+
+	/* Check socket parameter */
+	if (socket >= RTE_MAX_NUMA_NODES)
+		return NULL;
+
+	ret = heap_alloc_biggest_on_socket(type, socket, flags, align,
+			contig);
+	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+		return ret;
+
+	/* try other heaps */
+	for (i = 0; i < (int) rte_socket_count(); i++) {
+		cur_socket = rte_socket_id_by_idx(i);
+		if (cur_socket == socket)
+			continue;
+		ret = heap_alloc_biggest_on_socket(type, cur_socket, flags,
+				align, contig);
+		if (ret != NULL)
+			return ret;
+	}
+	return NULL;
+}
+
 /* this function is exposed in malloc_mp.h */
 int
 malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 03b8014..f52cb55 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -29,6 +29,10 @@ void *
 malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
 		size_t align, size_t bound, bool contig);
 
+void *
+malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig);
+
 int
 malloc_heap_free(struct malloc_elem *elem);
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v5 3/3] memzone: improve zero-length memzone reserve
  2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
                         ` (2 preceding siblings ...)
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 2/3] malloc: allow reserving biggest element Anatoly Burakov
@ 2018-05-14 13:47       ` Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-14 13:47 UTC (permalink / raw)
  To: dev; +Cc: remy.horton, sergio.gonzalez.monroy

Currently, reserving zero-length memzones is done by looking at
malloc statistics and reserving the biggest element found in those
statistics. This has two issues.

First, there is a race condition. The heap is unlocked between the
time we check the stats and the time we reserve a malloc element for
the memzone. This may make it impossible to reserve the memzone we
wanted, because another allocation might have taken place in the
meantime and the biggest element may no longer be available.

Second, the size returned by malloc statistics does not include any
alignment information, which is worked around by being conservative and
subtracting alignment length from the final result. This leads to
fragmentation and reserving memzones that could have been bigger but
aren't.

Fix all of this by using the earlier-introduced operation to reserve
the biggest possible malloc element. This, however, comes with a
trade-off, because we can only lock one heap at a time. So, if we
check the first available heap and find *any* element at all, that
element will be considered "the biggest", even though other heaps
might have bigger elements. We cannot know what other heaps have
before we try to allocate from them, and it is not a good idea to
lock all of the heaps at the same time, so we simply document this
limitation and encourage users to reserve memzones with the socket
id properly set.

Also, fix up the unit tests to account for the new behavior.
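
A short sketch of the v5-specific behavior (hypothetical values): with
len set to 0 and a non-zero bound, the bound is now used as the
requested length:

	const struct rte_memzone *mz;

	/* len == 0, bound == 128: reserves a 128-byte zone that does
	 * not cross a 128-byte boundary.
	 */
	mz = rte_memzone_reserve_bounded("bounded_zone", 0,
			rte_socket_id(), 0, RTE_CACHE_LINE_SIZE, 128);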

Fixes: fafcc11985a2 ("mem: rework memzone to be allocated by malloc")
Cc: sergio.gonzalez.monroy@intel.com

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v5:
    - Use bound len when reserving bounded zero-length memzones
    
    v4:
    - Rebased on latest master
    - Improved documentation
    - Added accounting for element pad [1]
    - Fixed max len underflow in test
    - Checkpatch fixes
    
    [1] A patch has recently fixed a similar issue:
    
    https://dpdk.org/dev/patchwork/patch/39332/
    
    The accounting for padding is also needed because the size of the
    element may include not only the malloc header overhead, but also
    padding, if the element has any.
    
    At first glance, it would seem like an additional change is needed for
    pre-18.05 code as well. However, on closer inspection, the original code
    was incorrect because it was comparing requested_len to 0, which is never
    zero; it is always at least the cache line size due to the earlier
    RTE_MAX() call (or rather, it could be zero, but in that case it would
    fail earlier).
    This downgrades the above quoted bug from "potential memory corruption bug"
    to "this bug was never a bug due to another bug".
    
    A proper fix for pre-18.05 would be to remove the check altogether and
    always go by requested_len, which is what we use to reserve memzones
    in the first place. I will submit it separately.

 lib/librte_eal/common/eal_common_memzone.c  |  70 +++---------
 lib/librte_eal/common/include/rte_memzone.h |  21 ++++
 test/test/test_memzone.c                    | 165 ++++++++++++++++------------
 3 files changed, 131 insertions(+), 125 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index faa3b06..7300fe0 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -52,38 +52,6 @@ memzone_lookup_thread_unsafe(const char *name)
 	return NULL;
 }
 
-
-/* This function will return the greatest free block if a heap has been
- * specified. If no heap has been specified, it will return the heap and
- * length of the greatest free block available in all heaps */
-static size_t
-find_heap_max_free_elem(int *s, unsigned align)
-{
-	struct rte_mem_config *mcfg;
-	struct rte_malloc_socket_stats stats;
-	int i, socket = *s;
-	size_t len = 0;
-
-	/* get pointer to global configuration */
-	mcfg = rte_eal_get_configuration()->mem_config;
-
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		if ((socket != SOCKET_ID_ANY) && (socket != i))
-			continue;
-
-		malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
-		if (stats.greatest_free_size > len) {
-			len = stats.greatest_free_size;
-			*s = i;
-		}
-	}
-
-	if (len < MALLOC_ELEM_OVERHEAD + align)
-		return 0;
-
-	return len - MALLOC_ELEM_OVERHEAD - align;
-}
-
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		int socket_id, unsigned int flags, unsigned int align,
@@ -92,6 +60,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	struct rte_memzone *mz;
 	struct rte_mem_config *mcfg;
 	struct rte_fbarray *arr;
+	void *mz_addr;
 	size_t requested_len;
 	int mz_idx;
 	bool contig;
@@ -140,8 +109,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		return NULL;
 	}
 
-	len += RTE_CACHE_LINE_MASK;
-	len &= ~((size_t) RTE_CACHE_LINE_MASK);
+	len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE);
 
 	/* save minimal requested  length */
 	requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE,  len);
@@ -165,27 +133,18 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	/* malloc only cares about size flags, remove contig flag from flags */
 	flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
-	if (len == 0) {
-		/* len == 0 is only allowed for non-contiguous zones */
-		if (contig) {
-			RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
-			rte_errno = EINVAL;
-			return NULL;
-		}
-		if (bound != 0)
+	if (len == 0 && bound == 0) {
+		/* no size constraints were placed, so use malloc elem len */
+		requested_len = 0;
+		mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags,
+				align, contig);
+	} else {
+		if (len == 0)
 			requested_len = bound;
-		else {
-			requested_len = find_heap_max_free_elem(&socket_id, align);
-			if (requested_len == 0) {
-				rte_errno = ENOMEM;
-				return NULL;
-			}
-		}
+		/* allocate memory on heap */
+		mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id,
+				flags, align, bound, contig);
 	}
-
-	/* allocate memory on heap */
-	void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
-			align, bound, contig);
 	if (mz_addr == NULL) {
 		rte_errno = ENOMEM;
 		return NULL;
@@ -213,8 +172,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	snprintf(mz->name, sizeof(mz->name), "%s", name);
 	mz->iova = rte_malloc_virt2iova(mz_addr);
 	mz->addr = mz_addr;
-	mz->len = (requested_len == 0 ?
-			(elem->size - MALLOC_ELEM_OVERHEAD) : requested_len);
+	mz->len = requested_len == 0 ?
+			elem->size - elem->pad - MALLOC_ELEM_OVERHEAD :
+			requested_len;
 	mz->hugepage_sz = elem->msl->page_sz;
 	mz->socket_id = elem->msl->socket_id;
 	mz->flags = 0;
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index a4c9bd6..f478fa9 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -81,6 +81,13 @@ struct rte_memzone {
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather the
+ *   biggest memzone available on the socket id corresponding to the lcore
+ *   from which the reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
@@ -138,6 +145,13 @@ const struct rte_memzone *rte_memzone_reserve(const char *name,
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather the
+ *   biggest memzone available on the socket id corresponding to the lcore
+ *   from which the reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
@@ -200,6 +214,13 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name,
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather the
+ *   biggest memzone available on the socket id corresponding to the lcore
+ *   from which the reservation was called.
+ *
  * @param name
  *   The name of the memzone. If it already exists, the function will
  *   fail and return NULL.
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index efcf732..452d7cc 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -467,61 +467,69 @@ test_memzone_reserve_flags(void)
 
 /* Find the heap with the greatest free block size */
 static size_t
-find_max_block_free_size(const unsigned _align)
+find_max_block_free_size(unsigned int align, unsigned int socket_id)
 {
 	struct rte_malloc_socket_stats stats;
-	unsigned i, align = _align;
-	size_t len = 0;
+	size_t len, overhead;
 
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		rte_malloc_get_socket_stats(i, &stats);
-		if (stats.greatest_free_size > len)
-			len = stats.greatest_free_size;
-	}
+	rte_malloc_get_socket_stats(socket_id, &stats);
+
+	len = stats.greatest_free_size;
+	overhead = MALLOC_ELEM_OVERHEAD;
+
+	if (len == 0)
+		return 0;
 
-	if (align < RTE_CACHE_LINE_SIZE)
-		align = RTE_CACHE_LINE_ROUNDUP(align+1);
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+	overhead += align;
 
-	if (len <= MALLOC_ELEM_OVERHEAD + align)
+	if (len < overhead)
 		return 0;
 
-	return len - MALLOC_ELEM_OVERHEAD - align;
+	return len - overhead;
 }
 
 static int
 test_memzone_reserve_max(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen;
+	unsigned int i;
 
-	maxlen = find_max_block_free_size(0);
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen;
+		int socket;
 
-	if (maxlen == 0) {
-		printf("There is no space left!\n");
-		return 0;
-	}
+		socket = rte_socket_id_by_idx(i);
+		maxlen = find_max_block_free_size(0, socket);
 
-	mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
-			SOCKET_ID_ANY, 0);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (maxlen == 0) {
+			printf("There is no space left!\n");
+			return 0;
+		}
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size did not return bigest block\n");
-		printf("Expected size = %zu, actual size = %zu\n", maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
+				socket, 0);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
+		if (mz->len != maxlen) {
+			printf("Memzone reserve with 0 size did not return biggest block\n");
+			printf("Expected size = %zu, actual size = %zu\n",
+					maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
 	}
 
 	return 0;
@@ -530,45 +538,62 @@ test_memzone_reserve_max(void)
 static int
 test_memzone_reserve_max_aligned(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen = 0;
+	unsigned int i;
 
-	/* random alignment */
-	rte_srand((unsigned)rte_rdtsc());
-	const unsigned align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen, minlen = 0;
+		int socket;
 
-	maxlen = find_max_block_free_size(align);
+		socket = rte_socket_id_by_idx(i);
 
-	if (maxlen == 0) {
-		printf("There is no space left for biggest %u-aligned memzone!\n", align);
-		return 0;
-	}
+		/* random alignment */
+		rte_srand((unsigned int)rte_rdtsc());
+		const unsigned int align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
 
-	mz = rte_memzone_reserve_aligned(TEST_MEMZONE_NAME("max_zone_aligned"),
-			0, SOCKET_ID_ANY, 0, align);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		/* memzone size may be between size and size - align */
+		minlen = find_max_block_free_size(align, socket);
+		maxlen = find_max_block_free_size(0, socket);
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size and alignment %u did not return"
-				" bigest block\n", align);
-		printf("Expected size = %zu, actual size = %zu\n",
-				maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (minlen == 0 || maxlen == 0) {
+			printf("There is no space left for biggest %u-aligned memzone!\n",
+					align);
+			return 0;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
-	}
+		mz = rte_memzone_reserve_aligned(
+				TEST_MEMZONE_NAME("max_zone_aligned"),
+				0, socket, 0, align);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+		if (mz->addr != RTE_PTR_ALIGN(mz->addr, align)) {
+			printf("Memzone reserve with 0 size and alignment %u did not return aligned block\n",
+					align);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
+		if (mz->len < minlen || mz->len > maxlen) {
+			printf("Memzone reserve with 0 size and alignment %u did not return"
+					" biggest block\n", align);
+			printf("Expected size = %zu-%zu, actual size = %zu\n",
+					minlen, maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
+	}
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 32+ messages in thread
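
[Editorial note] The @note blocks added to rte_memzone.h above advise
pinning zero-length reservations to a specific socket. Below is a
minimal usage sketch, assuming the caller runs on an EAL lcore; the
function and zone names are hypothetical and not part of the patch.

#include <stdio.h>
#include <rte_memzone.h>
#include <rte_lcore.h>
#include <rte_errno.h>

/* Sketch: reserve the biggest free zone on the caller's own socket.
 * len == 0 requests the largest available element; passing an explicit
 * socket id avoids the SOCKET_ID_ANY caveat described in the @note.
 */
static const struct rte_memzone *
reserve_biggest_local(void)
{
	const struct rte_memzone *mz;

	mz = rte_memzone_reserve("biggest_local", 0, rte_socket_id(), 0);
	if (mz == NULL)
		printf("reserve failed: %s\n", rte_strerror(rte_errno));
	return mz;
}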

* Re: [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-contiguous element to stats
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
@ 2018-05-14 13:58         ` Shreyansh Jain
  0 siblings, 0 replies; 32+ messages in thread
From: Shreyansh Jain @ 2018-05-14 13:58 UTC (permalink / raw)
  To: Anatoly Burakov, dev; +Cc: remy.horton

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Anatoly Burakov
> Sent: Monday, May 14, 2018 7:17 PM
> To: dev@dpdk.org
> Cc: remy.horton@intel.com
> Subject: [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-
> contiguous element to stats
> 
> User might be interested to find out what is the biggest chunk of
> IOVA-contiguous free space that can be allocated from malloc. Add
> relevant malloc-internal functions and expose this through malloc
> stats calculation call.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Acked-by: Remy Horton <remy.horton@intel.com>
> ---
> 

Acked-by: Shreyansh Jain <Shreyansh.jain@nxp.com>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-05-14 11:29         ` Burakov, Anatoly
  2018-05-14 12:23           ` Burakov, Anatoly
@ 2018-05-15  6:24           ` Remy Horton
  2018-05-15  8:37             ` Burakov, Anatoly
  1 sibling, 1 reply; 32+ messages in thread
From: Remy Horton @ 2018-05-15  6:24 UTC (permalink / raw)
  To: Burakov, Anatoly, dev; +Cc: sergio.gonzalez.monroy


On 14/05/2018 12:29, Burakov, Anatoly wrote:
[..]
> This failure is not caused by this patchset, and you should get similar
> failures on master if you get these while testing my patchset. I am not
> able to reproduce this issue, but i'll double-check the bounded reserve
> code with a fine-toothed comb anyway.

I reliably get the failure with V3 applied to RC2, but so far haven't 
been able to replicate with any other combination (clean RC2, RC2+V5, 
master+V4, etc). Odd..

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve
  2018-05-15  6:24           ` Remy Horton
@ 2018-05-15  8:37             ` Burakov, Anatoly
  0 siblings, 0 replies; 32+ messages in thread
From: Burakov, Anatoly @ 2018-05-15  8:37 UTC (permalink / raw)
  To: Remy Horton, dev; +Cc: sergio.gonzalez.monroy

On 15-May-18 7:24 AM, Remy Horton wrote:
> 
> On 14/05/2018 12:29, Burakov, Anatoly wrote:
> [..]
>> This failure is not caused by this patchset, and you should get similar
>> failures on master if you get these while testing my patchset. I am not
>> able to reproduce this issue, but i'll double-check the bounded reserve
>> code with a fine-toothed comb anyway.
> 
> I reliably get the failure with V3 applied to RC2, but so far haven't 
> been able to replicate with any other combination (clean RC2, RC2+V5, 
> master+V4, etc). Odd..
> 

I've retested on plain v3+rc2 (I was previously testing on master), and
I can reproduce the failure as well. The bug that was causing this issue
was fixed on rebase in v4. So, rc2+v4/v5 no longer has this issue.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [dpdk-dev] [PATCH v6 0/3] Improve zero-length memzone allocation
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
@ 2018-05-31  9:50         ` Anatoly Burakov
  2018-07-13  9:24           ` Thomas Monjalon
  2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 1/3] malloc: add finding biggest free IOVA-contiguous element Anatoly Burakov
                           ` (2 subsequent siblings)
  3 siblings, 1 reply; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-31  9:50 UTC (permalink / raw)
  To: dev

This patchset does two things. First, it enables reserving
memzones of zero-length that are IOVA-contiguous. Second,
it fixes a long-standing race condition in reserving
zero-length memzones, where the malloc heap was not locked between
stats collection and reservation; the biggest element is now
allocated on the spot, under the heap lock.

Some limitations are added, but they are a trade-off between
not having race conditions and user convenience. It would be
possible to lock all heaps during memzone reserve for zero-
length, and that would keep the old behavior, but given how
such allocation (especially when asking for IOVA-contiguous
memory) may take a long time, a design decision was made to
keep things simple, and only check other heaps if the
current one is completely busy.

Ideas on improvement are welcome.

v6:
- Rebase on top of 18.05
- Dropped malloc stats changes as no deprecation notice was
  sent, and i would like to integrate these changes in this
  release :)

v5:
- Use bound length if reserving memzone with zero length

v4:
- Fixes in memzone test
- Account for element padding
- Fix for wrong memzone size being returned
- Documentation fixes

Anatoly Burakov (3):
  malloc: add finding biggest free IOVA-contiguous element
  malloc: allow reserving biggest element
  memzone: improve zero-length memzone reserve

 lib/librte_eal/common/eal_common_memzone.c  |  70 ++-------
 lib/librte_eal/common/include/rte_memzone.h |  24 ++-
 lib/librte_eal/common/malloc_elem.c         |  79 ++++++++++
 lib/librte_eal/common/malloc_elem.h         |   6 +
 lib/librte_eal/common/malloc_heap.c         | 126 +++++++++++++++
 lib/librte_eal/common/malloc_heap.h         |   4 +
 test/test/test_memzone.c                    | 165 +++++++++++---------
 7 files changed, 343 insertions(+), 131 deletions(-)

-- 
2.17.0

^ permalink raw reply	[flat|nested] 32+ messages in thread
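
[Editorial note] The cover letter above says zero-length reservations
can now also be IOVA-contiguous. A hedged sketch of the caller's side,
with a hypothetical zone name, assuming the patchset is applied:

#include <stdio.h>
#include <inttypes.h>
#include <rte_memzone.h>
#include <rte_lcore.h>

/* Sketch: len == 0 combined with RTE_MEMZONE_IOVA_CONTIG now reserves
 * the biggest IOVA-contiguous free element on the chosen socket.
 */
static void
try_reserve_biggest_contig(void)
{
	const struct rte_memzone *mz;

	mz = rte_memzone_reserve("big_contig", 0, rte_socket_id(),
			RTE_MEMZONE_IOVA_CONTIG);
	if (mz != NULL)
		printf("got %zu bytes at IOVA 0x%" PRIx64 "\n",
				mz->len, (uint64_t)mz->iova);
}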

* [dpdk-dev] [PATCH v6 1/3] malloc: add finding biggest free IOVA-contiguous element
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
  2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 " Anatoly Burakov
@ 2018-05-31  9:50         ` Anatoly Burakov
  2018-05-31  9:51         ` [dpdk-dev] [PATCH v6 2/3] malloc: allow reserving biggest element Anatoly Burakov
  2018-05-31  9:51         ` [dpdk-dev] [PATCH v6 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-31  9:50 UTC (permalink / raw)
  To: dev

Add an internal-only function to find the biggest free
IOVA-contiguous malloc element. It is not exposed through the
external API.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
Acked-by: Shreyansh Jain <Shreyansh.jain@nxp.com>
---

Notes:
    v6:
    - Patch was postponed to 18.08 but i forgot to add
      deprecation notice for the API changes, so
      these external malloc stats API changes have been
      dropped from this patchset
    
    v4:
    - Fix comments to be more up to date with v4 code
    - Add comments explaining trailer handling
    
    v2:
    - Add header to newly recalculated element start
    
    v2:
    - Add header to newly recalculated element start

 lib/librte_eal/common/malloc_elem.c | 79 +++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_elem.h |  6 +++
 2 files changed, 85 insertions(+)

diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 9bfe9b9b4..f1bb4fee7 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -18,10 +18,89 @@
 #include <rte_common.h>
 #include <rte_spinlock.h>
 
+#include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
+{
+	void *cur_page, *contig_seg_start, *page_end, *cur_seg_end;
+	void *data_start, *data_end;
+	rte_iova_t expected_iova;
+	struct rte_memseg *ms;
+	size_t page_sz, cur, max;
+
+	page_sz = (size_t)elem->msl->page_sz;
+	data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+	data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN);
+	/* segment must start after header and with specified alignment */
+	contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align);
+
+	/* if we're in IOVA as VA mode, or if we're in legacy mode with
+	 * hugepages, all elements are IOVA-contiguous.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA ||
+			(internal_config.legacy_mem && rte_eal_has_hugepages()))
+		return RTE_PTR_DIFF(data_end, contig_seg_start);
+
+	cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz);
+	ms = rte_mem_virt2memseg(cur_page, elem->msl);
+
+	/* do first iteration outside the loop */
+	page_end = RTE_PTR_ADD(cur_page, page_sz);
+	cur_seg_end = RTE_MIN(page_end, data_end);
+	cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) -
+			MALLOC_ELEM_TRAILER_LEN;
+	max = cur;
+	expected_iova = ms->iova + page_sz;
+	/* memsegs are contiguous in memory */
+	ms++;
+
+	cur_page = RTE_PTR_ADD(cur_page, page_sz);
+
+	while (cur_page < data_end) {
+		page_end = RTE_PTR_ADD(cur_page, page_sz);
+		cur_seg_end = RTE_MIN(page_end, data_end);
+
+		/* reset start of contiguous segment if unexpected iova */
+		if (ms->iova != expected_iova) {
+			/* next contiguous segment must start at specified
+			 * alignment.
+			 */
+			contig_seg_start = RTE_PTR_ALIGN(cur_page, align);
+			/* new segment start may be on a different page, so find
+			 * the page and skip to next iteration to make sure
+			 * we're not blowing past data end.
+			 */
+			ms = rte_mem_virt2memseg(contig_seg_start, elem->msl);
+			cur_page = ms->addr;
+			/* don't trigger another recalculation */
+			expected_iova = ms->iova;
+			continue;
+		}
+		/* cur_seg_end ends on a page boundary or on data end. if we're
+		 * looking at data end, then malloc trailer is already included
+		 * in the calculations. if we're looking at page end, then we
+		 * know there's more data past this page and thus there's space
+		 * for malloc element trailer, so don't count it here.
+		 */
+		cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start);
+		/* update max if cur value is bigger */
+		if (cur > max)
+			max = cur;
+
+		/* move to next page */
+		cur_page = page_end;
+		expected_iova = ms->iova + page_sz;
+		/* memsegs are contiguous in memory */
+		ms++;
+	}
+
+	return max;
+}
+
 /*
  * Initialize a general malloc_elem header structure
  */
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 7331af9ca..e2bda4c02 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -179,4 +179,10 @@ malloc_elem_free_list_index(size_t size);
 void
 malloc_elem_free_list_insert(struct malloc_elem *elem);
 
+/*
+ * Find biggest IOVA-contiguous zone within an element with specified alignment.
+ */
+size_t
+malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align);
+
 #endif /* MALLOC_ELEM_H_ */
-- 
2.17.0

^ permalink raw reply	[flat|nested] 32+ messages in thread
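
[Editorial note] To make the page walk in
malloc_elem_find_max_iova_contig() above easier to follow, here is a
stripped-down sketch of the same idea: scan per-page IOVAs and track
the longest physically contiguous run. This is pure illustration; it
ignores alignment, the element header/trailer and the memseg lookup
the real code performs.

#include <stddef.h>
#include <stdint.h>

static size_t
max_iova_contig_run(const uint64_t *page_iova, size_t n_pages,
		size_t page_sz)
{
	size_t i, cur = 0, max = 0;

	for (i = 0; i < n_pages; i++) {
		/* the run continues if this page's IOVA directly
		 * follows the previous page's IOVA
		 */
		if (i > 0 && page_iova[i] == page_iova[i - 1] + page_sz)
			cur += page_sz;
		else
			cur = page_sz; /* run restarts at this page */
		if (cur > max)
			max = cur;
	}
	return max;
}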

* [dpdk-dev] [PATCH v6 2/3] malloc: allow reserving biggest element
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
  2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 " Anatoly Burakov
  2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 1/3] malloc: add finding biggest free IOVA-contiguous element Anatoly Burakov
@ 2018-05-31  9:51         ` Anatoly Burakov
  2018-05-31  9:51         ` [dpdk-dev] [PATCH v6 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-31  9:51 UTC (permalink / raw)
  To: dev

Add an internal-only function to allocate the biggest element from
the heap. Nominally, it supports SOCKET_ID_ANY as its socket
argument, but this is essentially useless, because other sockets
will only be allocated from if the entire heap on the current or
specified socket is busy.

Still, being able to reserve the biggest element makes it possible
to fix a race condition in memzone reserve that has been there for
a long time.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---
 lib/librte_eal/common/malloc_heap.c | 126 ++++++++++++++++++++++++++++
 lib/librte_eal/common/malloc_heap.h |   4 +
 2 files changed, 130 insertions(+)

diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index d6cf3af81..12aaf2d72 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -148,6 +148,52 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
 	return NULL;
 }
 
+/*
+ * Iterates through the freelist for a heap to find a free element with the
+ * biggest size and requested alignment. Will also set size to whatever element
+ * size that was found.
+ * Returns null on failure, or pointer to element on success.
+ */
+static struct malloc_elem *
+find_biggest_element(struct malloc_heap *heap, size_t *size,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem, *max_elem = NULL;
+	size_t idx, max_size = 0;
+
+	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
+		for (elem = LIST_FIRST(&heap->free_head[idx]);
+				!!elem; elem = LIST_NEXT(elem, free_list)) {
+			size_t cur_size;
+			if (!check_hugepage_sz(flags, elem->msl->page_sz))
+				continue;
+			if (contig) {
+				cur_size =
+					malloc_elem_find_max_iova_contig(elem,
+							align);
+			} else {
+				void *data_start = RTE_PTR_ADD(elem,
+						MALLOC_ELEM_HEADER_LEN);
+				void *data_end = RTE_PTR_ADD(elem, elem->size -
+						MALLOC_ELEM_TRAILER_LEN);
+				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
+						align);
+				/* check if aligned data start is beyond end */
+				if (aligned >= data_end)
+					continue;
+				cur_size = RTE_PTR_DIFF(data_end, aligned);
+			}
+			if (cur_size > max_size) {
+				max_size = cur_size;
+				max_elem = elem;
+			}
+		}
+	}
+
+	*size = max_size;
+	return max_elem;
+}
+
 /*
  * Main function to allocate a block of memory from the heap.
  * It locks the free list, scans it, and adds a new memseg if the
@@ -174,6 +220,26 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
 	return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
+static void *
+heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
+		unsigned int flags, size_t align, bool contig)
+{
+	struct malloc_elem *elem;
+	size_t size;
+
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+
+	elem = find_biggest_element(heap, &size, flags, align, contig);
+	if (elem != NULL) {
+		elem = malloc_elem_alloc(elem, size, align, 0, contig);
+
+		/* increase heap's count of allocated elements */
+		heap->alloc_count++;
+	}
+
+	return elem == NULL ? NULL : (void *)(&elem[1]);
+}
+
 /* this function is exposed in malloc_mp.h */
 void
 rollback_expand_heap(struct rte_memseg **ms, int n_segs,
@@ -575,6 +641,66 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
 	return NULL;
 }
 
+static void *
+heap_alloc_biggest_on_socket(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+	void *ret;
+
+	rte_spinlock_lock(&(heap->lock));
+
+	align = align == 0 ? 1 : align;
+
+	ret = heap_alloc_biggest(heap, type, flags, align, contig);
+
+	rte_spinlock_unlock(&(heap->lock));
+
+	return ret;
+}
+
+void *
+malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
+		size_t align, bool contig)
+{
+	int socket, i, cur_socket;
+	void *ret;
+
+	/* return NULL if align is not power-of-2 */
+	if ((align && !rte_is_power_of_2(align)))
+		return NULL;
+
+	if (!rte_eal_has_hugepages())
+		socket_arg = SOCKET_ID_ANY;
+
+	if (socket_arg == SOCKET_ID_ANY)
+		socket = malloc_get_numa_socket();
+	else
+		socket = socket_arg;
+
+	/* Check socket parameter */
+	if (socket >= RTE_MAX_NUMA_NODES)
+		return NULL;
+
+	ret = heap_alloc_biggest_on_socket(type, socket, flags, align,
+			contig);
+	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+		return ret;
+
+	/* try other heaps */
+	for (i = 0; i < (int) rte_socket_count(); i++) {
+		cur_socket = rte_socket_id_by_idx(i);
+		if (cur_socket == socket)
+			continue;
+		ret = heap_alloc_biggest_on_socket(type, cur_socket, flags,
+				align, contig);
+		if (ret != NULL)
+			return ret;
+	}
+	return NULL;
+}
+
 /* this function is exposed in malloc_mp.h */
 int
 malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index 03b801415..f52cb5559 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -29,6 +29,10 @@ void *
 malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
 		size_t align, size_t bound, bool contig);
 
+void *
+malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags,
+		size_t align, bool contig);
+
 int
 malloc_heap_free(struct malloc_elem *elem);
 
-- 
2.17.0

^ permalink raw reply	[flat|nested] 32+ messages in thread
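
[Editorial note] The non-contiguous branch of find_biggest_element()
above measures how much of an element remains usable once the data
start is aligned. A self-contained sketch of that arithmetic; HDR_LEN
and TRL_LEN are placeholders standing in for the internal
MALLOC_ELEM_HEADER_LEN and MALLOC_ELEM_TRAILER_LEN constants.

#include <stddef.h>
#include <stdint.h>

#define HDR_LEN 64 /* placeholder for MALLOC_ELEM_HEADER_LEN */
#define TRL_LEN 64 /* placeholder for MALLOC_ELEM_TRAILER_LEN */

/* align must be a power of two, as the caller enforces */
static size_t
usable_size(uintptr_t elem, size_t elem_size, size_t align)
{
	uintptr_t data_start = elem + HDR_LEN;
	uintptr_t data_end = elem + elem_size - TRL_LEN;
	uintptr_t aligned = (data_start + align - 1) & ~(uintptr_t)(align - 1);

	/* the aligned data start may land past the element's end */
	if (aligned >= data_end)
		return 0;
	return data_end - aligned;
}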

* [dpdk-dev] [PATCH v6 3/3] memzone: improve zero-length memzone reserve
  2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
                           ` (2 preceding siblings ...)
  2018-05-31  9:51         ` [dpdk-dev] [PATCH v6 2/3] malloc: allow reserving biggest element Anatoly Burakov
@ 2018-05-31  9:51         ` Anatoly Burakov
  3 siblings, 0 replies; 32+ messages in thread
From: Anatoly Burakov @ 2018-05-31  9:51 UTC (permalink / raw)
  To: dev; +Cc: sergio.gonzalez.monroy

Currently, reserving zero-length memzones is done by looking at
malloc statistics, and reserving biggest sized element found in those
statistics. This has two issues.

First, there is a race condition. The heap is unlocked between the
time we check stats, and the time we reserve malloc element for memzone.
This may lead to inability to reserve the memzone we wanted to reserve,
because another allocation might have taken place and biggest sized
element may no longer be available.

Second, the size returned by malloc statistics does not include any
alignment information, which is worked around by being conservative and
subtracting alignment length from the final result. This leads to
fragmentation and reserving memzones that could have been bigger but
aren't.

Fix all of this by using the earlier-introduced operation to reserve
the biggest possible malloc element. This, however, comes with a
trade-off, because we can only lock one heap at a time. So, if we
check the first available heap and find *any* element at all, that
element will be considered "the biggest", even though other heaps
might have bigger elements. We cannot know what other heaps contain
before we try to allocate from them, and it is not a good idea to
lock all of the heaps at the same time, so we will just document
this limitation and encourage users to reserve memzones with the
socket id properly set.

Also, fixup unit tests to account for the new behavior.

Fixes: fafcc11985a2 ("mem: rework memzone to be allocated by malloc")
Cc: sergio.gonzalez.monroy@intel.com

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v6:
    - Rebase on 18.05
    
    v5:
    - Use bound len when reserving bounded zero-length memzones
    
    v4:
    - Rebased on latest master
    - Improved documentation
    - Added accounting for element pad [1]
    - Fixed max len underflow in test
    - Checkpatch fixes
    
    [1] A patch has recently fixed a similar issue:
    
    https://dpdk.org/dev/patchwork/patch/39332/
    
    The accounting for padding is also needed because the size of the element
    may include not only the malloc header overhead, but also padding, if
    the element has any.
    
    At first glance, it would seem that an additional change is needed for
    pre-18.05 code as well. However, on closer inspection, the original code
    was incorrect because it compared requested_len to 0, which is never
    zero: it is always at least a cache line in size due to the earlier
    RTE_MAX() call (or rather, it could be zero, but in that case the
    allocation would fail earlier).
    This downgrades the above quoted bug from "potential memory corruption bug"
    to "this bug was never a bug due to another bug".
    
    A proper fix for pre-18.05 would be to remove the check altogether and
    always go by requested_len, which is what we use to reserve memzones
    in the first place. I will submit it separately.

 lib/librte_eal/common/eal_common_memzone.c  |  70 ++-------
 lib/librte_eal/common/include/rte_memzone.h |  24 ++-
 test/test/test_memzone.c                    | 165 +++++++++++---------
 3 files changed, 128 insertions(+), 131 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index faa3b0615..7300fe05d 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -52,38 +52,6 @@ memzone_lookup_thread_unsafe(const char *name)
 	return NULL;
 }
 
-
-/* This function will return the greatest free block if a heap has been
- * specified. If no heap has been specified, it will return the heap and
- * length of the greatest free block available in all heaps */
-static size_t
-find_heap_max_free_elem(int *s, unsigned align)
-{
-	struct rte_mem_config *mcfg;
-	struct rte_malloc_socket_stats stats;
-	int i, socket = *s;
-	size_t len = 0;
-
-	/* get pointer to global configuration */
-	mcfg = rte_eal_get_configuration()->mem_config;
-
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		if ((socket != SOCKET_ID_ANY) && (socket != i))
-			continue;
-
-		malloc_heap_get_stats(&mcfg->malloc_heaps[i], &stats);
-		if (stats.greatest_free_size > len) {
-			len = stats.greatest_free_size;
-			*s = i;
-		}
-	}
-
-	if (len < MALLOC_ELEM_OVERHEAD + align)
-		return 0;
-
-	return len - MALLOC_ELEM_OVERHEAD - align;
-}
-
 static const struct rte_memzone *
 memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		int socket_id, unsigned int flags, unsigned int align,
@@ -92,6 +60,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	struct rte_memzone *mz;
 	struct rte_mem_config *mcfg;
 	struct rte_fbarray *arr;
+	void *mz_addr;
 	size_t requested_len;
 	int mz_idx;
 	bool contig;
@@ -140,8 +109,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		return NULL;
 	}
 
-	len += RTE_CACHE_LINE_MASK;
-	len &= ~((size_t) RTE_CACHE_LINE_MASK);
+	len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE);
 
 	/* save minimal requested  length */
 	requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE,  len);
@@ -165,27 +133,18 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	/* malloc only cares about size flags, remove contig flag from flags */
 	flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
-	if (len == 0) {
-		/* len == 0 is only allowed for non-contiguous zones */
-		if (contig) {
-			RTE_LOG(DEBUG, EAL, "Reserving zero-length contiguous memzones is not supported\n");
-			rte_errno = EINVAL;
-			return NULL;
-		}
-		if (bound != 0)
+	if (len == 0 && bound == 0) {
+		/* no size constraints were placed, so use malloc elem len */
+		requested_len = 0;
+		mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags,
+				align, contig);
+	} else {
+		if (len == 0)
 			requested_len = bound;
-		else {
-			requested_len = find_heap_max_free_elem(&socket_id, align);
-			if (requested_len == 0) {
-				rte_errno = ENOMEM;
-				return NULL;
-			}
-		}
+		/* allocate memory on heap */
+		mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id,
+				flags, align, bound, contig);
 	}
-
-	/* allocate memory on heap */
-	void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, flags,
-			align, bound, contig);
 	if (mz_addr == NULL) {
 		rte_errno = ENOMEM;
 		return NULL;
@@ -213,8 +172,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	snprintf(mz->name, sizeof(mz->name), "%s", name);
 	mz->iova = rte_malloc_virt2iova(mz_addr);
 	mz->addr = mz_addr;
-	mz->len = (requested_len == 0 ?
-			(elem->size - MALLOC_ELEM_OVERHEAD) : requested_len);
+	mz->len = requested_len == 0 ?
+			elem->size - elem->pad - MALLOC_ELEM_OVERHEAD :
+			requested_len;
 	mz->hugepage_sz = elem->msl->page_sz;
 	mz->socket_id = elem->msl->socket_id;
 	mz->flags = 0;
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index ef370fa6f..f478fa9e6 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -81,8 +81,12 @@ struct rte_memzone {
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
- * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently
- *   supported.
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather the
+ *   biggest memzone available on the socket id corresponding to the lcore
+ *   from which the reservation was called.
  *
  * @param name
  *   The name of the memzone. If it already exists, the function will
@@ -141,8 +145,12 @@ const struct rte_memzone *rte_memzone_reserve(const char *name,
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
- * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently
- *   supported.
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather the
+ *   biggest memzone available on the socket id corresponding to the lcore
+ *   from which the reservation was called.
  *
  * @param name
  *   The name of the memzone. If it already exists, the function will
@@ -206,8 +214,12 @@ const struct rte_memzone *rte_memzone_reserve_aligned(const char *name,
  *   memzones from memory that is already available. It will not trigger any
  *   new allocations.
  *
- * @note Reserving IOVA-contiguous memzones with len set to 0 is not currently
- *   supported.
+ * @note: When reserving memzones with len set to 0, it is preferable to also
+ *   set a valid socket_id. Setting socket_id to SOCKET_ID_ANY is supported, but
+ *   will likely not yield expected results. Specifically, the resulting memzone
+ *   may not necessarily be the biggest memzone available, but rather the
+ *   biggest memzone available on the socket id corresponding to the lcore
+ *   from which the reservation was called.
  *
  * @param name
  *   The name of the memzone. If it already exists, the function will
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index efcf7327e..452d7cc5e 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -467,61 +467,69 @@ test_memzone_reserve_flags(void)
 
 /* Find the heap with the greatest free block size */
 static size_t
-find_max_block_free_size(const unsigned _align)
+find_max_block_free_size(unsigned int align, unsigned int socket_id)
 {
 	struct rte_malloc_socket_stats stats;
-	unsigned i, align = _align;
-	size_t len = 0;
+	size_t len, overhead;
 
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		rte_malloc_get_socket_stats(i, &stats);
-		if (stats.greatest_free_size > len)
-			len = stats.greatest_free_size;
-	}
+	rte_malloc_get_socket_stats(socket_id, &stats);
+
+	len = stats.greatest_free_size;
+	overhead = MALLOC_ELEM_OVERHEAD;
+
+	if (len == 0)
+		return 0;
 
-	if (align < RTE_CACHE_LINE_SIZE)
-		align = RTE_CACHE_LINE_ROUNDUP(align+1);
+	align = RTE_CACHE_LINE_ROUNDUP(align);
+	overhead += align;
 
-	if (len <= MALLOC_ELEM_OVERHEAD + align)
+	if (len < overhead)
 		return 0;
 
-	return len - MALLOC_ELEM_OVERHEAD - align;
+	return len - overhead;
 }
 
 static int
 test_memzone_reserve_max(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen;
+	unsigned int i;
 
-	maxlen = find_max_block_free_size(0);
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen;
+		int socket;
 
-	if (maxlen == 0) {
-		printf("There is no space left!\n");
-		return 0;
-	}
+		socket = rte_socket_id_by_idx(i);
+		maxlen = find_max_block_free_size(0, socket);
 
-	mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
-			SOCKET_ID_ANY, 0);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (maxlen == 0) {
+			printf("There is no space left!\n");
+			return 0;
+		}
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size did not return bigest block\n");
-		printf("Expected size = %zu, actual size = %zu\n", maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		mz = rte_memzone_reserve(TEST_MEMZONE_NAME("max_zone"), 0,
+				socket, 0);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
+		if (mz->len != maxlen) {
+			printf("Memzone reserve with 0 size did not return biggest block\n");
+			printf("Expected size = %zu, actual size = %zu\n",
+					maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
 	}
 
 	return 0;
@@ -530,45 +538,62 @@ test_memzone_reserve_max(void)
 static int
 test_memzone_reserve_max_aligned(void)
 {
-	const struct rte_memzone *mz;
-	size_t maxlen = 0;
+	unsigned int i;
 
-	/* random alignment */
-	rte_srand((unsigned)rte_rdtsc());
-	const unsigned align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
+	for (i = 0; i < rte_socket_count(); i++) {
+		const struct rte_memzone *mz;
+		size_t maxlen, minlen = 0;
+		int socket;
 
-	maxlen = find_max_block_free_size(align);
+		socket = rte_socket_id_by_idx(i);
 
-	if (maxlen == 0) {
-		printf("There is no space left for biggest %u-aligned memzone!\n", align);
-		return 0;
-	}
+		/* random alignment */
+		rte_srand((unsigned int)rte_rdtsc());
+		const unsigned int align = 1 << ((rte_rand() % 8) + 5); /* from 128 up to 4k alignment */
 
-	mz = rte_memzone_reserve_aligned(TEST_MEMZONE_NAME("max_zone_aligned"),
-			0, SOCKET_ID_ANY, 0, align);
-	if (mz == NULL){
-		printf("Failed to reserve a big chunk of memory - %s\n",
-				rte_strerror(rte_errno));
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		/* memzone size may be between size and size - align */
+		minlen = find_max_block_free_size(align, socket);
+		maxlen = find_max_block_free_size(0, socket);
 
-	if (mz->len != maxlen) {
-		printf("Memzone reserve with 0 size and alignment %u did not return"
-				" bigest block\n", align);
-		printf("Expected size = %zu, actual size = %zu\n",
-				maxlen, mz->len);
-		rte_dump_physmem_layout(stdout);
-		rte_memzone_dump(stdout);
-		return -1;
-	}
+		if (minlen == 0 || maxlen == 0) {
+			printf("There is no space left for biggest %u-aligned memzone!\n",
+					align);
+			return 0;
+		}
 
-	if (rte_memzone_free(mz)) {
-		printf("Fail memzone free\n");
-		return -1;
-	}
+		mz = rte_memzone_reserve_aligned(
+				TEST_MEMZONE_NAME("max_zone_aligned"),
+				0, socket, 0, align);
+		if (mz == NULL) {
+			printf("Failed to reserve a big chunk of memory - %s\n",
+					rte_strerror(rte_errno));
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+		if (mz->addr != RTE_PTR_ALIGN(mz->addr, align)) {
+			printf("Memzone reserve with 0 size and alignment %u did not return aligned block\n",
+					align);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
 
+		if (mz->len < minlen || mz->len > maxlen) {
+			printf("Memzone reserve with 0 size and alignment %u did not return"
+					" biggest block\n", align);
+			printf("Expected size = %zu-%zu, actual size = %zu\n",
+					minlen, maxlen, mz->len);
+			rte_dump_physmem_layout(stdout);
+			rte_memzone_dump(stdout);
+			return -1;
+		}
+
+		if (rte_memzone_free(mz)) {
+			printf("Fail memzone free\n");
+			return -1;
+		}
+	}
 	return 0;
 }
 
-- 
2.17.0

^ permalink raw reply	[flat|nested] 32+ messages in thread
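
[Editorial note] The commit message above explains that the old code
derived the reservation size from malloc stats outside the heap lock.
A sketch of that stats-based calculation, which the reworked unit test
still uses for verification; MALLOC_OVERHEAD is a placeholder for the
internal MALLOC_ELEM_OVERHEAD constant.

#include <stddef.h>
#include <rte_malloc.h>

#define MALLOC_OVERHEAD 128 /* placeholder, see malloc_elem.h */

/* Expected length of a zero-length reservation on one socket: the
 * biggest free element minus element overhead and worst-case
 * alignment loss.
 */
static size_t
expected_max_len(int socket_id, size_t align)
{
	struct rte_malloc_socket_stats stats;

	if (rte_malloc_get_socket_stats(socket_id, &stats) < 0)
		return 0;
	if (stats.greatest_free_size < MALLOC_OVERHEAD + align)
		return 0;
	return stats.greatest_free_size - MALLOC_OVERHEAD - align;
}

Because nothing holds the heap lock between this calculation and a
later reserve, another allocation can invalidate the result; that is
exactly the race the patch removes by allocating the biggest element
directly under the lock.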

* Re: [dpdk-dev] [PATCH v6 0/3] Improve zero-length memzone allocation
  2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 " Anatoly Burakov
@ 2018-07-13  9:24           ` Thomas Monjalon
  0 siblings, 0 replies; 32+ messages in thread
From: Thomas Monjalon @ 2018-07-13  9:24 UTC (permalink / raw)
  To: Anatoly Burakov; +Cc: dev

31/05/2018 11:50, Anatoly Burakov:
> This patchset does two things. First, it enables reserving
> memzones of zero-length that are IOVA-contiguous. Second,
> it fixes a long-standing race condition in reserving
> zero-length memzones, where the malloc heap was not locked between
> stats collection and reservation; the biggest element is now
> allocated on the spot, under the heap lock.
> 
> Some limitations are added, but they are a trade-off between
> not having race conditions and user convenience. It would be
> possible to lock all heaps during memzone reserve for zero-
> length, and that would keep the old behavior, but given how
> such allocation (especially when asking for IOVA-contiguous
> memory) may take a long time, a design decision was made to
> keep things simple, and only check other heaps if the
> current one is completely busy.
> 
> Ideas on improvement are welcome.
> 
> Anatoly Burakov (3):
>   malloc: add finding biggest free IOVA-contiguous element
>   malloc: allow reserving biggest element
>   memzone: improve zero-length memzone reserve

Applied, thanks

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread

Thread overview: 32+ messages
2018-04-25 14:10 [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
2018-04-25 14:10 ` [dpdk-dev] [PATCH 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
2018-04-25 14:40 ` [dpdk-dev] [PATCH 1/2] malloc: add biggest free IOVA-contiguous element to stats Burakov, Anatoly
2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 " Anatoly Burakov
2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 0/3] Improve zero-length memzone allocation Anatoly Burakov
2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 " Anatoly Burakov
2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 " Anatoly Burakov
2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 " Anatoly Burakov
2018-07-13  9:24           ` Thomas Monjalon
2018-05-31  9:50         ` [dpdk-dev] [PATCH v6 1/3] malloc: add finding biggest free IOVA-contiguous element Anatoly Burakov
2018-05-31  9:51         ` [dpdk-dev] [PATCH v6 2/3] malloc: allow reserving biggest element Anatoly Burakov
2018-05-31  9:51         ` [dpdk-dev] [PATCH v6 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
2018-05-14 13:58         ` Shreyansh Jain
2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 2/3] malloc: allow reserving biggest element Anatoly Burakov
2018-05-14 13:47       ` [dpdk-dev] [PATCH v5 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 2/3] malloc: allow reserving biggest element Anatoly Burakov
2018-05-14 11:19     ` [dpdk-dev] [PATCH v4 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
2018-05-03 17:17   ` [dpdk-dev] [PATCH v3 1/3] malloc: add biggest free IOVA-contiguous element to stats Anatoly Burakov
2018-05-10 13:39     ` Remy Horton
2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 2/3] malloc: allow reserving biggest element Anatoly Burakov
2018-05-10 13:57     ` Remy Horton
2018-05-14  8:22       ` Burakov, Anatoly
2018-05-03 17:18   ` [dpdk-dev] [PATCH v3 3/3] memzone: improve zero-length memzone reserve Anatoly Burakov
2018-05-11 10:25     ` Remy Horton
2018-05-14  8:21       ` Burakov, Anatoly
2018-05-14 11:29         ` Burakov, Anatoly
2018-05-14 12:23           ` Burakov, Anatoly
2018-05-15  6:24           ` Remy Horton
2018-05-15  8:37             ` Burakov, Anatoly
2018-04-26  8:06 ` [dpdk-dev] [PATCH v2 2/2] memzone: allow IOVA-contiguous memzones with zero size Anatoly Burakov
