DPDK patches and discussions
* [PATCH 1/2] mempool/cnxk: limit usage of async allocs
@ 2023-07-31  5:55 Ashwin Sekhar T K
  2023-07-31  5:55 ` [PATCH 2/2] mempool/cnxk: fix alloc from non-EAL pthreads Ashwin Sekhar T K
  2023-08-22 17:01 ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
  0 siblings, 2 replies; 7+ messages in thread
From: Ashwin Sekhar T K @ 2023-07-31  5:55 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ashwin Sekhar T K, Pavan Nikhilesh
  Cc: jerinj, psatheesh, anoobj, gakhil, hkalra

Currently, the mempool_cnxk driver uses asynchronous allocation
for all pools. This asynchronous allocation can result in up to
512 additional objects being cached locally on a single core
even when the cache is disabled. This will eventually lead to
starvation on pools where the number of objects is very small.

This commit changes this logic to use asynchronous allocation only
on those pools which have the local cache enabled. Also, the async
buffer size will be RTE_ALIGN_CEIL(rte_mempool->cache_size, 16). This
means that when the cache is disabled, async alloc will be completely
disabled, and when the cache is enabled, the additional caching due
to asynchronous allocation will be limited.
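
As an illustration only (not part of this patch), a minimal sketch in C of
the sizing rule described above, assuming the 16-object alignment mentioned
here; the helper name is hypothetical:

#include <rte_common.h>
#include <rte_mempool.h>

/* Hypothetical helper: the async buffer size a pool would get under the
 * scheme described above.
 */
static unsigned int
async_buf_size(const struct rte_mempool *mp)
{
	/* Cache disabled: no async buffer, so async alloc is disabled. */
	if (mp->cache_size == 0)
		return 0;
	/* Cache enabled: cache size rounded up to a multiple of 16. */
	return RTE_ALIGN_CEIL(mp->cache_size, 16);
}

For example, cache_size = 8 would give an async buffer of 16 objects, so at
most 8 + 16 = 24 objects could be cached on a single core.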

A note on this limitation has been added to the cnxk documentation,
warning users to adjust local cache sizes accordingly.

Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>
---
 doc/guides/nics/cnxk.rst                 |  18 ++++
 drivers/mempool/cnxk/cn10k_mempool_ops.c | 103 ++++++++++++++++++++---
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index 9229056f6f..5d90d22e2b 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -433,6 +433,24 @@ The OCTEON CN9K/CN10K SoC family NIC has inbuilt HW assisted external mempool ma
 as it is performance wise most effective way for packet allocation and Tx buffer
 recycling on OCTEON 9 SoC platform.
 
+``mempool_cnxk`` rte_mempool cache sizes for CN10K
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The OCTEON CN10K SoC family supports asynchronous batch allocation of
+objects from an NPA pool. In the CNXK mempool driver, asynchronous batch
+allocation is enabled when local caches are enabled. This asynchronous
+batch allocation uses an additional local async buffer whose size is
+``RTE_ALIGN_CEIL(rte_mempool->cache_size, 16)``. This can
+result in additional objects being cached locally. While creating an
+rte_mempool using the ``mempool_cnxk`` driver for OCTEON CN10K, this must
+be taken into consideration and the local cache sizes should be adjusted
+accordingly so that starvation does not happen.
+
+For example: if the ``cache_size`` passed into ``rte_mempool_create`` is ``8``,
+then the max objects that can get cached locally on a core would be the
+sum of max objects in the local cache + max objects in the async buffer,
+i.e. ``8 + RTE_ALIGN_CEIL(8, 16) = 24``.
+
 CRC stripping
 ~~~~~~~~~~~~~
 
diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
index ff0015d8de..41b755b52b 100644
--- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
+++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
@@ -10,6 +10,7 @@
 #define BATCH_ALLOC_SZ              ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS
 #define BATCH_OP_DATA_TABLE_MZ_NAME "batch_op_data_table_mz"
 #define BATCH_ALLOC_WAIT_US         5
+#define BATCH_ALLOC_RETRIES         4
 
 enum batch_op_status {
 	BATCH_ALLOC_OP_NOT_ISSUED = 0,
@@ -25,6 +26,7 @@ struct batch_op_mem {
 
 struct batch_op_data {
 	uint64_t lmt_addr;
+	uint32_t max_async_batch;
 	struct batch_op_mem mem[RTE_MAX_LCORE] __rte_aligned(ROC_ALIGN);
 };
 
@@ -97,6 +99,10 @@ batch_op_init(struct rte_mempool *mp)
 	}
 
 	op_data->lmt_addr = roc_idev_lmt_base_addr_get();
+	op_data->max_async_batch =
+		RTE_MIN((unsigned int)BATCH_ALLOC_SZ,
+			RTE_ALIGN_CEIL(mp->cache_size, ROC_ALIGN / 8));
+
 	batch_op_data_set(mp->pool_id, op_data);
 	rte_wmb();
 
@@ -117,13 +123,17 @@ batch_op_fini(struct rte_mempool *mp)
 		return;
 	}
 
+	/* If max_async_batch == 0, then batch mem will be empty */
+	if (op_data->max_async_batch == 0)
+		goto free_op_data;
+
 	rte_wmb();
 	for (i = 0; i < RTE_MAX_LCORE; i++) {
 		struct batch_op_mem *mem = &op_data->mem[i];
 
 		if (mem->status == BATCH_ALLOC_OP_ISSUED) {
 			mem->sz = roc_npa_aura_batch_alloc_extract(
-				mem->objs, mem->objs, BATCH_ALLOC_SZ);
+				mem->objs, mem->objs, op_data->max_async_batch);
 			mem->status = BATCH_ALLOC_OP_DONE;
 		}
 		if (mem->status == BATCH_ALLOC_OP_DONE) {
@@ -133,6 +143,7 @@ batch_op_fini(struct rte_mempool *mp)
 		}
 	}
 
+free_op_data:
 	rte_free(op_data);
 	batch_op_data_set(mp->pool_id, NULL);
 	rte_wmb();
@@ -172,6 +183,9 @@ cn10k_mempool_get_count(const struct rte_mempool *mp)
 	int i;
 
 	op_data = batch_op_data_get(mp->pool_id);
+	/* If max_async_batch == 0, then batch alloc mem will be empty */
+	if (op_data->max_async_batch == 0)
+		goto npa_pool_count;
 
 	rte_wmb();
 	for (i = 0; i < RTE_MAX_LCORE; i++) {
@@ -179,19 +193,27 @@ cn10k_mempool_get_count(const struct rte_mempool *mp)
 
 		if (mem->status == BATCH_ALLOC_OP_ISSUED)
 			count += roc_npa_aura_batch_alloc_count(
-				mem->objs, BATCH_ALLOC_SZ, BATCH_ALLOC_WAIT_US);
+				mem->objs, op_data->max_async_batch,
+				BATCH_ALLOC_WAIT_US);
 
 		if (mem->status == BATCH_ALLOC_OP_DONE)
 			count += mem->sz;
 	}
 
+npa_pool_count:
 	count += cnxk_mempool_get_count(mp);
 
 	return count;
 }
 
-static int __rte_hot
-cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+static inline unsigned int __rte_hot
+mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	return cnxk_mempool_deq(mp, obj_table, n) ? 0 : n;
+}
+
+static inline unsigned int __rte_hot
+mempool_deq_batch_async(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
 	struct batch_op_data *op_data;
 	struct batch_op_mem *mem;
@@ -205,24 +227,24 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 
 	/* Issue batch alloc */
 	if (mem->status == BATCH_ALLOC_OP_NOT_ISSUED) {
-		rc = roc_npa_aura_batch_alloc_issue(mp->pool_id, mem->objs,
-						    BATCH_ALLOC_SZ, 0, 1);
+		rc = roc_npa_aura_batch_alloc_issue(
+			mp->pool_id, mem->objs, op_data->max_async_batch, 0, 1);
 		/* If issue fails, try falling back to default alloc */
 		if (unlikely(rc))
-			return cnxk_mempool_deq(mp, obj_table, n);
+			return mempool_deq(mp, obj_table, n);
 		mem->status = BATCH_ALLOC_OP_ISSUED;
 	}
 
-	retry = 4;
+	retry = BATCH_ALLOC_RETRIES;
 	while (loop) {
 		unsigned int cur_sz;
 
 		if (mem->status == BATCH_ALLOC_OP_ISSUED) {
 			mem->sz = roc_npa_aura_batch_alloc_extract(
-				mem->objs, mem->objs, BATCH_ALLOC_SZ);
+				mem->objs, mem->objs, op_data->max_async_batch);
 
 			/* If partial alloc reduce the retry count */
-			retry -= (mem->sz != BATCH_ALLOC_SZ);
+			retry -= (mem->sz != op_data->max_async_batch);
 			/* Break the loop if retry count exhausted */
 			loop = !!retry;
 			mem->status = BATCH_ALLOC_OP_DONE;
@@ -244,13 +266,72 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 		/* Issue next batch alloc if pointers are exhausted */
 		if (mem->sz == 0) {
 			rc = roc_npa_aura_batch_alloc_issue(
-				mp->pool_id, mem->objs, BATCH_ALLOC_SZ, 0, 1);
+				mp->pool_id, mem->objs,
+				op_data->max_async_batch, 0, 1);
 			/* Break loop if issue failed and set status */
 			loop &= !rc;
 			mem->status = !rc;
 		}
 	}
 
+	return count;
+}
+
+static inline unsigned int __rte_hot
+mempool_deq_batch_sync(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	struct batch_op_data *op_data;
+	struct batch_op_mem *mem;
+	unsigned int count = 0;
+	int tid, retry, rc;
+
+	op_data = batch_op_data_get(mp->pool_id);
+	tid = rte_lcore_id();
+	mem = &op_data->mem[tid];
+
+	retry = BATCH_ALLOC_RETRIES;
+	while (count != n && retry) {
+		unsigned int cur_sz, batch_sz;
+
+		cur_sz = n - count;
+		batch_sz = RTE_MIN(BATCH_ALLOC_SZ, (int)cur_sz);
+
+		/* Issue batch alloc */
+		rc = roc_npa_aura_batch_alloc_issue(mp->pool_id, mem->objs,
+						    batch_sz, 0, 1);
+
+		/* If issue fails, try falling back to default alloc */
+		if (unlikely(rc))
+			return count +
+			       mempool_deq(mp, obj_table + count, n - count);
+
+		cur_sz = roc_npa_aura_batch_alloc_extract(mem->objs, mem->objs,
+							  batch_sz);
+
+		/* Dequeue the pointers */
+		memcpy(&obj_table[count], mem->objs,
+		       cur_sz * sizeof(uintptr_t));
+		count += cur_sz;
+
+		/* If partial alloc reduce the retry count */
+		retry -= (batch_sz != cur_sz);
+	}
+
+	return count;
+}
+
+static int __rte_hot
+cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	struct batch_op_data *op_data;
+	unsigned int count = 0;
+
+	op_data = batch_op_data_get(mp->pool_id);
+	if (op_data->max_async_batch)
+		count = mempool_deq_batch_async(mp, obj_table, n);
+	else
+		count = mempool_deq_batch_sync(mp, obj_table, n);
+
 	if (unlikely(count != n)) {
 		/* No partial alloc allowed. Free up allocated pointers */
 		cn10k_mempool_enq(mp, obj_table, count);
-- 
2.25.1



* [PATCH 2/2] mempool/cnxk: fix alloc from non-EAL pthreads
  2023-07-31  5:55 [PATCH 1/2] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
@ 2023-07-31  5:55 ` Ashwin Sekhar T K
  2023-08-18  9:21   ` Jerin Jacob
  2023-08-22 17:01 ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
  1 sibling, 1 reply; 7+ messages in thread
From: Ashwin Sekhar T K @ 2023-07-31  5:55 UTC (permalink / raw)
  To: dev, Ashwin Sekhar T K, Pavan Nikhilesh
  Cc: jerinj, skori, skoteshwar, kirankumark, psatheesh, anoobj,
	gakhil, hkalra, ndabilpuram

For non-EAL pthreads, rte_lcore_id() will not be valid. So,
batch allocation cannot be used, as such threads won't have a
dedicated alloc buffer. Fall back to bulk alloc in such
cases.
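
For context, a minimal sketch (not part of this patch) of how this situation
arises: a plain pthread that is neither an EAL worker nor registered via
rte_thread_register() sees rte_lcore_id() == LCORE_ID_ANY, so it cannot own a
per-lcore alloc buffer. The worker function below is hypothetical:

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_mempool.h>

/* Launched by the application with pthread_create(), not by EAL. */
static void *
non_eal_worker(void *arg)
{
	struct rte_mempool *mp = arg;
	void *objs[8];

	/* Here rte_lcore_id() returns LCORE_ID_ANY, so the driver must not
	 * index its per-lcore batch buffers and falls back to bulk alloc.
	 */
	if (rte_mempool_get_bulk(mp, objs, RTE_DIM(objs)) == 0)
		rte_mempool_put_bulk(mp, objs, RTE_DIM(objs));

	return NULL;
}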

Fixes: 91531e63f43b ("mempool/cnxk: add cn10k batch dequeue")

Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>
---
 drivers/mempool/cnxk/cn10k_mempool_ops.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
index 41b755b52b..9594370ecd 100644
--- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
+++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
@@ -326,6 +326,12 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 	struct batch_op_data *op_data;
 	unsigned int count = 0;
 
+	/* For non-EAL threads, rte_lcore_id() will not be valid. Hence
+	 * fallback to bulk alloc
+	 */
+	if (unlikely(rte_lcore_id() == LCORE_ID_ANY))
+		return cnxk_mempool_deq(mp, obj_table, n);
+
 	op_data = batch_op_data_get(mp->pool_id);
 	if (op_data->max_async_batch)
 		count = mempool_deq_batch_async(mp, obj_table, n);
-- 
2.25.1



* Re: [PATCH 2/2] mempool/cnxk: fix alloc from non-EAL pthreads
  2023-07-31  5:55 ` [PATCH 2/2] mempool/cnxk: fix alloc from non-EAL pthreads Ashwin Sekhar T K
@ 2023-08-18  9:21   ` Jerin Jacob
  0 siblings, 0 replies; 7+ messages in thread
From: Jerin Jacob @ 2023-08-18  9:21 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: dev, Pavan Nikhilesh, jerinj, skori, skoteshwar, kirankumark,
	psatheesh, anoobj, gakhil, hkalra, ndabilpuram

On Mon, Jul 31, 2023 at 11:25 AM Ashwin Sekhar T K <asekhar@marvell.com> wrote:
>
> For non-EAL pthreads, rte_lcore_id() will not be valid.
> So, batch allocation cannot be used as we won't have a
> dedicated alloc buffer for the thread. So, fallback to
> bulk alloc in such cases.
>
> Fixes: 91531e63f43b ("mempool/cnxk: add cn10k batch dequeue")
>
> Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>
> ---
>  drivers/mempool/cnxk/cn10k_mempool_ops.c | 6 ++++++
>  1 file changed, 6 insertions(+)
>
> diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
> index 41b755b52b..9594370ecd 100644
> --- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
> +++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
> @@ -326,6 +326,12 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)



Looks like the same problem exists on enq() as well. If so, please fix that too.


>         struct batch_op_data *op_data;
>         unsigned int count = 0;
>
> +       /* For non-EAL threads, rte_lcore_id() will not be valid. Hence
> +        * fallback to bulk alloc
> +        */
> +       if (unlikely(rte_lcore_id() == LCORE_ID_ANY))
> +               return cnxk_mempool_deq(mp, obj_table, n);
> +
>         op_data = batch_op_data_get(mp->pool_id);
>         if (op_data->max_async_batch)
>                 count = mempool_deq_batch_async(mp, obj_table, n);
> --
> 2.25.1
>


* [PATCH 1/3] mempool/cnxk: limit usage of async allocs
  2023-07-31  5:55 [PATCH 1/2] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
  2023-07-31  5:55 ` [PATCH 2/2] mempool/cnxk: fix alloc from non-EAL pthreads Ashwin Sekhar T K
@ 2023-08-22 17:01 ` Ashwin Sekhar T K
  2023-08-22 17:01   ` [PATCH 2/3] mempool/cnxk: fix free from non-EAL pthreads Ashwin Sekhar T K
                     ` (2 more replies)
  1 sibling, 3 replies; 7+ messages in thread
From: Ashwin Sekhar T K @ 2023-08-22 17:01 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Ashwin Sekhar T K, Pavan Nikhilesh
  Cc: jerinj, psatheesh, anoobj, gakhil, hkalra

Currently, the mempool_cnxk driver uses asynchronous allocation
for all pools. This asynchronous allocation can result in up to
512 additional objects being cached locally on a single core
even when the cache is disabled. This will eventually lead to
starvation on pools where the number of objects is very small.

This commit changes this logic to use asynchronous allocation only
on those pools which have the local cache enabled. Also, the async
buffer size will be RTE_ALIGN_CEIL(rte_mempool->cache_size, 16). This
means that when the cache is disabled, async alloc will be completely
disabled, and when the cache is enabled, the additional caching due
to asynchronous allocation will be limited.

A note on this limitation has been added to the cnxk documentation,
warning users to adjust local cache sizes accordingly.

Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>
---
 doc/guides/nics/cnxk.rst                 |  18 ++++
 drivers/mempool/cnxk/cn10k_mempool_ops.c | 103 ++++++++++++++++++++---
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index 9229056f6f..5d90d22e2b 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -433,6 +433,24 @@ The OCTEON CN9K/CN10K SoC family NIC has inbuilt HW assisted external mempool ma
 as it is performance wise most effective way for packet allocation and Tx buffer
 recycling on OCTEON 9 SoC platform.
 
+``mempool_cnxk`` rte_mempool cache sizes for CN10K
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The OCTEON CN10K SoC family supports asynchronous batch allocation of
+objects from an NPA pool. In the CNXK mempool driver, asynchronous batch
+allocation is enabled when local caches are enabled. This asynchronous
+batch allocation uses an additional local async buffer whose size is
+``RTE_ALIGN_CEIL(rte_mempool->cache_size, 16)``. This can
+result in additional objects being cached locally. While creating an
+rte_mempool using the ``mempool_cnxk`` driver for OCTEON CN10K, this must
+be taken into consideration and the local cache sizes should be adjusted
+accordingly so that starvation does not happen.
+
+For example: if the ``cache_size`` passed into ``rte_mempool_create`` is ``8``,
+then the max objects that can get cached locally on a core would be the
+sum of max objects in the local cache + max objects in the async buffer,
+i.e. ``8 + RTE_ALIGN_CEIL(8, 16) = 24``.
+
 CRC stripping
 ~~~~~~~~~~~~~
 
diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
index ff0015d8de..41b755b52b 100644
--- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
+++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
@@ -10,6 +10,7 @@
 #define BATCH_ALLOC_SZ              ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS
 #define BATCH_OP_DATA_TABLE_MZ_NAME "batch_op_data_table_mz"
 #define BATCH_ALLOC_WAIT_US         5
+#define BATCH_ALLOC_RETRIES         4
 
 enum batch_op_status {
 	BATCH_ALLOC_OP_NOT_ISSUED = 0,
@@ -25,6 +26,7 @@ struct batch_op_mem {
 
 struct batch_op_data {
 	uint64_t lmt_addr;
+	uint32_t max_async_batch;
 	struct batch_op_mem mem[RTE_MAX_LCORE] __rte_aligned(ROC_ALIGN);
 };
 
@@ -97,6 +99,10 @@ batch_op_init(struct rte_mempool *mp)
 	}
 
 	op_data->lmt_addr = roc_idev_lmt_base_addr_get();
+	op_data->max_async_batch =
+		RTE_MIN((unsigned int)BATCH_ALLOC_SZ,
+			RTE_ALIGN_CEIL(mp->cache_size, ROC_ALIGN / 8));
+
 	batch_op_data_set(mp->pool_id, op_data);
 	rte_wmb();
 
@@ -117,13 +123,17 @@ batch_op_fini(struct rte_mempool *mp)
 		return;
 	}
 
+	/* If max_async_batch == 0, then batch mem will be empty */
+	if (op_data->max_async_batch == 0)
+		goto free_op_data;
+
 	rte_wmb();
 	for (i = 0; i < RTE_MAX_LCORE; i++) {
 		struct batch_op_mem *mem = &op_data->mem[i];
 
 		if (mem->status == BATCH_ALLOC_OP_ISSUED) {
 			mem->sz = roc_npa_aura_batch_alloc_extract(
-				mem->objs, mem->objs, BATCH_ALLOC_SZ);
+				mem->objs, mem->objs, op_data->max_async_batch);
 			mem->status = BATCH_ALLOC_OP_DONE;
 		}
 		if (mem->status == BATCH_ALLOC_OP_DONE) {
@@ -133,6 +143,7 @@ batch_op_fini(struct rte_mempool *mp)
 		}
 	}
 
+free_op_data:
 	rte_free(op_data);
 	batch_op_data_set(mp->pool_id, NULL);
 	rte_wmb();
@@ -172,6 +183,9 @@ cn10k_mempool_get_count(const struct rte_mempool *mp)
 	int i;
 
 	op_data = batch_op_data_get(mp->pool_id);
+	/* If max_async_batch == 0, then batch alloc mem will be empty */
+	if (op_data->max_async_batch == 0)
+		goto npa_pool_count;
 
 	rte_wmb();
 	for (i = 0; i < RTE_MAX_LCORE; i++) {
@@ -179,19 +193,27 @@ cn10k_mempool_get_count(const struct rte_mempool *mp)
 
 		if (mem->status == BATCH_ALLOC_OP_ISSUED)
 			count += roc_npa_aura_batch_alloc_count(
-				mem->objs, BATCH_ALLOC_SZ, BATCH_ALLOC_WAIT_US);
+				mem->objs, op_data->max_async_batch,
+				BATCH_ALLOC_WAIT_US);
 
 		if (mem->status == BATCH_ALLOC_OP_DONE)
 			count += mem->sz;
 	}
 
+npa_pool_count:
 	count += cnxk_mempool_get_count(mp);
 
 	return count;
 }
 
-static int __rte_hot
-cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+static inline unsigned int __rte_hot
+mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	return cnxk_mempool_deq(mp, obj_table, n) ? 0 : n;
+}
+
+static inline unsigned int __rte_hot
+mempool_deq_batch_async(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
 	struct batch_op_data *op_data;
 	struct batch_op_mem *mem;
@@ -205,24 +227,24 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 
 	/* Issue batch alloc */
 	if (mem->status == BATCH_ALLOC_OP_NOT_ISSUED) {
-		rc = roc_npa_aura_batch_alloc_issue(mp->pool_id, mem->objs,
-						    BATCH_ALLOC_SZ, 0, 1);
+		rc = roc_npa_aura_batch_alloc_issue(
+			mp->pool_id, mem->objs, op_data->max_async_batch, 0, 1);
 		/* If issue fails, try falling back to default alloc */
 		if (unlikely(rc))
-			return cnxk_mempool_deq(mp, obj_table, n);
+			return mempool_deq(mp, obj_table, n);
 		mem->status = BATCH_ALLOC_OP_ISSUED;
 	}
 
-	retry = 4;
+	retry = BATCH_ALLOC_RETRIES;
 	while (loop) {
 		unsigned int cur_sz;
 
 		if (mem->status == BATCH_ALLOC_OP_ISSUED) {
 			mem->sz = roc_npa_aura_batch_alloc_extract(
-				mem->objs, mem->objs, BATCH_ALLOC_SZ);
+				mem->objs, mem->objs, op_data->max_async_batch);
 
 			/* If partial alloc reduce the retry count */
-			retry -= (mem->sz != BATCH_ALLOC_SZ);
+			retry -= (mem->sz != op_data->max_async_batch);
 			/* Break the loop if retry count exhausted */
 			loop = !!retry;
 			mem->status = BATCH_ALLOC_OP_DONE;
@@ -244,13 +266,72 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 		/* Issue next batch alloc if pointers are exhausted */
 		if (mem->sz == 0) {
 			rc = roc_npa_aura_batch_alloc_issue(
-				mp->pool_id, mem->objs, BATCH_ALLOC_SZ, 0, 1);
+				mp->pool_id, mem->objs,
+				op_data->max_async_batch, 0, 1);
 			/* Break loop if issue failed and set status */
 			loop &= !rc;
 			mem->status = !rc;
 		}
 	}
 
+	return count;
+}
+
+static inline unsigned int __rte_hot
+mempool_deq_batch_sync(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	struct batch_op_data *op_data;
+	struct batch_op_mem *mem;
+	unsigned int count = 0;
+	int tid, retry, rc;
+
+	op_data = batch_op_data_get(mp->pool_id);
+	tid = rte_lcore_id();
+	mem = &op_data->mem[tid];
+
+	retry = BATCH_ALLOC_RETRIES;
+	while (count != n && retry) {
+		unsigned int cur_sz, batch_sz;
+
+		cur_sz = n - count;
+		batch_sz = RTE_MIN(BATCH_ALLOC_SZ, (int)cur_sz);
+
+		/* Issue batch alloc */
+		rc = roc_npa_aura_batch_alloc_issue(mp->pool_id, mem->objs,
+						    batch_sz, 0, 1);
+
+		/* If issue fails, try falling back to default alloc */
+		if (unlikely(rc))
+			return count +
+			       mempool_deq(mp, obj_table + count, n - count);
+
+		cur_sz = roc_npa_aura_batch_alloc_extract(mem->objs, mem->objs,
+							  batch_sz);
+
+		/* Dequeue the pointers */
+		memcpy(&obj_table[count], mem->objs,
+		       cur_sz * sizeof(uintptr_t));
+		count += cur_sz;
+
+		/* If partial alloc reduce the retry count */
+		retry -= (batch_sz != cur_sz);
+	}
+
+	return count;
+}
+
+static int __rte_hot
+cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+	struct batch_op_data *op_data;
+	unsigned int count = 0;
+
+	op_data = batch_op_data_get(mp->pool_id);
+	if (op_data->max_async_batch)
+		count = mempool_deq_batch_async(mp, obj_table, n);
+	else
+		count = mempool_deq_batch_sync(mp, obj_table, n);
+
 	if (unlikely(count != n)) {
 		/* No partial alloc allowed. Free up allocated pointers */
 		cn10k_mempool_enq(mp, obj_table, count);
-- 
2.25.1



* [PATCH 2/3] mempool/cnxk: fix free from non-EAL pthreads
  2023-08-22 17:01 ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
@ 2023-08-22 17:01   ` Ashwin Sekhar T K
  2023-08-22 17:01   ` [PATCH 3/3] mempool/cnxk: fix alloc " Ashwin Sekhar T K
  2023-08-24  4:57   ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Jerin Jacob
  2 siblings, 0 replies; 7+ messages in thread
From: Ashwin Sekhar T K @ 2023-08-22 17:01 UTC (permalink / raw)
  To: dev, Ashwin Sekhar T K, Pavan Nikhilesh
  Cc: jerinj, skori, skoteshwar, kirankumark, psatheesh, anoobj,
	gakhil, hkalra, ndabilpuram

From: Harman Kalra <hkalra@marvell.com>

For non-EAL pthreads, rte_lcore_id() will not be valid. So,
batch free cannot be used, as those threads won't have
dedicated lmtlines. Fall back to bulk free in such cases.

Fixes: ecbc731a2286 ("mempool/cnxk: add cn10k batch enqueue")

Signed-off-by: Harman Kalra <hkalra@marvell.com>
Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>
---
 drivers/mempool/cnxk/cn10k_mempool_ops.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
index 41b755b52b..2e46204c8d 100644
--- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
+++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
@@ -162,6 +162,12 @@ cn10k_mempool_enq(struct rte_mempool *mp, void *const *obj_table,
 	 */
 	rte_io_wmb();
 
+	/* For non-EAL threads, rte_lcore_id() will not be valid. Hence
+	 * fall back to bulk free.
+	 */
+	if (unlikely(rte_lcore_id() == LCORE_ID_ANY))
+		return cnxk_mempool_enq(mp, obj_table, n);
+
 	if (n == 1) {
 		roc_npa_aura_op_free(mp->pool_id, 1, ptr[0]);
 		return 0;
-- 
2.25.1



* [PATCH 3/3] mempool/cnxk: fix alloc from non-EAL pthreads
  2023-08-22 17:01 ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
  2023-08-22 17:01   ` [PATCH 2/3] mempool/cnxk: fix free from non-EAL pthreads Ashwin Sekhar T K
@ 2023-08-22 17:01   ` Ashwin Sekhar T K
  2023-08-24  4:57   ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Jerin Jacob
  2 siblings, 0 replies; 7+ messages in thread
From: Ashwin Sekhar T K @ 2023-08-22 17:01 UTC (permalink / raw)
  To: dev, Ashwin Sekhar T K, Pavan Nikhilesh
  Cc: jerinj, skori, skoteshwar, kirankumark, psatheesh, anoobj,
	gakhil, hkalra, ndabilpuram

For non-EAL pthreads, rte_lcore_id() will not be valid. So,
batch allocation cannot be used, as such threads won't have a
dedicated alloc buffer. Fall back to bulk alloc in such
cases.

Fixes: 91531e63f43b ("mempool/cnxk: add cn10k batch dequeue")

Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>
---
 drivers/mempool/cnxk/cn10k_mempool_ops.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/mempool/cnxk/cn10k_mempool_ops.c b/drivers/mempool/cnxk/cn10k_mempool_ops.c
index 2e46204c8d..2a5aad0008 100644
--- a/drivers/mempool/cnxk/cn10k_mempool_ops.c
+++ b/drivers/mempool/cnxk/cn10k_mempool_ops.c
@@ -332,6 +332,12 @@ cn10k_mempool_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 	struct batch_op_data *op_data;
 	unsigned int count = 0;
 
+	/* For non-EAL threads, rte_lcore_id() will not be valid. Hence
+	 * fallback to bulk alloc
+	 */
+	if (unlikely(rte_lcore_id() == LCORE_ID_ANY))
+		return cnxk_mempool_deq(mp, obj_table, n);
+
 	op_data = batch_op_data_get(mp->pool_id);
 	if (op_data->max_async_batch)
 		count = mempool_deq_batch_async(mp, obj_table, n);
-- 
2.25.1



* Re: [PATCH 1/3] mempool/cnxk: limit usage of async allocs
  2023-08-22 17:01 ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
  2023-08-22 17:01   ` [PATCH 2/3] mempool/cnxk: fix free from non-EAL pthreads Ashwin Sekhar T K
  2023-08-22 17:01   ` [PATCH 3/3] mempool/cnxk: fix alloc " Ashwin Sekhar T K
@ 2023-08-24  4:57   ` Jerin Jacob
  2 siblings, 0 replies; 7+ messages in thread
From: Jerin Jacob @ 2023-08-24  4:57 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Pavan Nikhilesh, jerinj, psatheesh, anoobj, gakhil,
	hkalra

On Tue, Aug 22, 2023 at 10:40 PM Ashwin Sekhar T K <asekhar@marvell.com> wrote:
>
> Currently mempool_cnxk driver uses asynchronous allocation
> for all pools. This asynchronous allocation can result in
> local caching of additional 512 objects on a single core
> even when cache is disabled. This will eventually lead to
> starvation on pools where the number of objects is very less.
>
> This commit changes this logic to use asynchronous allocation on
> only those pools which have local cache enabled. Also the async buffer
> size will be RTE_ALIGN_CEIL(rte_mempool->cache_size, 16). This
> means that when cache is disabled, async alloc will be completely
> disabled and when cache is enabled, the additional caching due
> to asynchronous allocation will be limited.
>
> A limitation has been added to cnxk documentation warning the users
> to adjust the local cache sizes accordingly.
>
> Signed-off-by: Ashwin Sekhar T K <asekhar@marvell.com>

Series applied to dpdk-next-net-mrvl/for-next-net. Thanks


end of thread

Thread overview: 7+ messages
2023-07-31  5:55 [PATCH 1/2] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
2023-07-31  5:55 ` [PATCH 2/2] mempool/cnxk: fix alloc from non-EAL pthreads Ashwin Sekhar T K
2023-08-18  9:21   ` Jerin Jacob
2023-08-22 17:01 ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Ashwin Sekhar T K
2023-08-22 17:01   ` [PATCH 2/3] mempool/cnxk: fix free from non-EAL pthreads Ashwin Sekhar T K
2023-08-22 17:01   ` [PATCH 3/3] mempool/cnxk: fix alloc " Ashwin Sekhar T K
2023-08-24  4:57   ` [PATCH 1/3] mempool/cnxk: limit usage of async allocs Jerin Jacob
