DPDK patches and discussions
* [dpdk-dev] [RFC] mempool: implement index-based per core cache
@ 2021-09-30 17:27 Dharmik Thakkar
  2021-10-01 12:36 ` Jerin Jacob
                   ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Dharmik Thakkar @ 2021-09-30 17:27 UTC (permalink / raw)
  To: Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang, Dharmik Thakkar

The current mempool per-core cache implementation is pointer based.
On most architectures, each pointer consumes 64 bits.
Replace it with an index-based implementation, wherein each buffer
is addressed by (pool base address + index).
This reduces the memory footprint of the per-core cache.

L3Fwd performance testing shows minor improvements in cache
behaviour and no change in throughput.

Micro-benchmarking the patch using mempool_perf_test shows
significant improvement in the majority of the test cases.

A future step is to replace the global pool's pointer-based
implementation with an index-based implementation as well.
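
A minimal sketch of the conversion, for illustration only (the helper
names below are hypothetical and are not part of this patch; the real
conversion is done inline in __mempool_generic_put()/_get() and
rte_mempool_cache_flush()):

#include <stdint.h>
#include <rte_common.h>

/* Store a 32-bit offset from the pool's base address instead of a
 * 64-bit pointer; assumes every cached object lies within 4 GB of base. */
static inline uint32_t
obj_to_index(void *base, void *obj)
{
	return (uint32_t)RTE_PTR_DIFF(obj, base);
}

/* Recover the full pointer by adding the stored offset back to base. */
static inline void *
index_to_obj(void *base, uint32_t idx)
{
	return RTE_PTR_ADD(base, idx);
}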

Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 drivers/mempool/ring/rte_mempool_ring.c |  2 +-
 lib/mempool/rte_mempool.c               |  8 +++
 lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
index b1f09ff28f4d..e55913e47f21 100644
--- a/drivers/mempool/ring/rte_mempool_ring.c
+++ b/drivers/mempool/ring/rte_mempool_ring.c
@@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
 		return -rte_errno;
 
 	mp->pool_data = r;
-
+	mp->local_cache_base_addr = &r[1];
 	return 0;
 }
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 59a588425bd6..424bdb19c323 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	int ret;
 	bool need_iova_contig_obj;
 	size_t max_alloc_size = SIZE_MAX;
+	unsigned lcore_id;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		}
 	}
 
+	/* Init all default caches. */
+	if (mp->cache_size != 0) {
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			mp->local_cache[lcore_id].local_cache_base_value =
+				*(void **)mp->local_cache_base_addr;
+	}
+
 	rte_mempool_trace_populate_default(mp);
 	return mp->size;
 
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 4235d6f0bf2b..545405c0d3ce 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -51,6 +51,8 @@
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#include <arm_neon.h>
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -91,11 +93,12 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+	void *local_cache_base_value; /**< Base value to calculate indices */
 	/*
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 } __rte_cache_aligned;
 
 /**
@@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
  * A list of memory where objects are stored
  */
 STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
-
 /**
  * Callback used to free a memory chunk
  */
@@ -244,6 +246,7 @@ struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+	void *local_cache_base_addr; /**< Reference to the base value */
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
-	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = cache->local_cache_base_value;
+	uint32_t *cache_objs = cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
 	cache->len = 0;
 }
 
@@ -1289,7 +1300,9 @@ static __rte_always_inline void
 __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
 
 	/* increment stat now, adding in mempool always success */
 	__MEMPOOL_STAT_ADD(mp, put_bulk, 1);
@@ -1301,6 +1314,12 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 	cache_objs = &cache->objs[cache->len];
 
+	base_value = cache->local_cache_base_value;
+
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
 	/*
 	 * The cache follows the following algorithm
 	 *   1. Add the objects to the cache
@@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
-	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+
+#if defined __ARM_NEON
+	for (i = 0; i < (n & ~0x1); i+=2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
-		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
 				cache->len - cache->size);
 		cache->len = cache->size;
 	}
@@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
 	int ret;
+	uint32_t i;
 	uint32_t index, len;
-	void **cache_objs;
+	uint32_t *cache_objs;
 
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+	void *base_value = cache->local_cache_base_value;
 	cache_objs = cache->objs;
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
 		/* No. Backfill the cache first, and then fill from it */
 		uint32_t req = n + (cache->size - cache->len);
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 
 		/* How many do we require i.e. number to fill the cache + the request */
 		ret = rte_mempool_ops_dequeue_bulk(mp,
-			&cache->objs[cache->len], req);
+			temp_objs, req);
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
+		}
+
 		cache->len += req;
 	}
 
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
 	/* Now fill in the response ... */
+#if defined __ARM_NEON
+	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
+						len-=2, obj_table+=2) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+	}
+	if (n & 0x1)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
-		*obj_table = cache_objs[len];
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
 
 	cache->len -= n;
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-09-30 17:27 [dpdk-dev] [RFC] mempool: implement index-based per core cache Dharmik Thakkar
@ 2021-10-01 12:36 ` Jerin Jacob
  2021-10-01 15:44   ` Honnappa Nagarahalli
  2021-10-01 21:30 ` Ananyev, Konstantin
  2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
  2 siblings, 1 reply; 52+ messages in thread
From: Jerin Jacob @ 2021-10-01 12:36 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: Olivier Matz, Andrew Rybchenko, dpdk-dev, nd,
	Honnappa Nagarahalli, Ruifeng Wang (Arm Technology China)

On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
<dharmik.thakkar@arm.com> wrote:
>
> Current mempool per core cache implementation is based on pointer
> For most architectures, each pointer consumes 64b
> Replace it with index-based implementation, where in each buffer
> is addressed by (pool address + index)
> It will reduce memory requirements
>
> L3Fwd performance testing reveals minor improvements in the cache
> performance and no change in throughput
>
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
>
> Future plan involves replacing global pool's pointer-based implementation with index-based implementation
>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>


Sane idea. Like VPP, we tried to do this for rte_graph, but did not
observe much gain.
Since the lcore cache is typically 512 entries, maybe there is a gain on
the mempool path.
Also, since you are enabling this only for the local cache, it is good,
as mempool drivers can work as-is (i.e. HW drivers keep working with
64-bit pointers).
I think getting more performance numbers for various cases may be the
next step.
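
A rough sketch of that boundary (illustrative only; the function name is
hypothetical and mirrors rte_mempool_cache_flush() from the RFC, assuming
the patch's cache layout): the 32-bit indices never leave the cache layer,
so the driver-facing ops still exchange full void * pointers.

#include <rte_mempool.h>

static void
example_flush_indices(struct rte_mempool *mp, struct rte_mempool_cache *cache)
{
	void *ptrs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
	unsigned int i;

	/* widen the cached 32-bit offsets back to pointers ... */
	for (i = 0; i < cache->len; i++)
		ptrs[i] = RTE_PTR_ADD(cache->local_cache_base_value,
				cache->objs[i]);
	/* ... before calling the unchanged driver enqueue */
	rte_mempool_ops_enqueue_bulk(mp, ptrs, cache->len);
	cache->len = 0;
}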

> ---
>  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
>  lib/mempool/rte_mempool.c               |  8 +++
>  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
>  3 files changed, 74 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
> index b1f09ff28f4d..e55913e47f21 100644
> --- a/drivers/mempool/ring/rte_mempool_ring.c
> +++ b/drivers/mempool/ring/rte_mempool_ring.c
> @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
>                 return -rte_errno;
>
>         mp->pool_data = r;
> -
> +       mp->local_cache_base_addr = &r[1];
>         return 0;
>  }
>
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 59a588425bd6..424bdb19c323 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>         int ret;
>         bool need_iova_contig_obj;
>         size_t max_alloc_size = SIZE_MAX;
> +       unsigned lcore_id;
>
>         ret = mempool_ops_alloc_once(mp);
>         if (ret != 0)
> @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>                 }
>         }
>
> +       /* Init all default caches. */
> +       if (mp->cache_size != 0) {
> +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +                       mp->local_cache[lcore_id].local_cache_base_value =
> +                               *(void **)mp->local_cache_base_addr;
> +       }
> +
>         rte_mempool_trace_populate_default(mp);
>         return mp->size;
>
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 4235d6f0bf2b..545405c0d3ce 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -51,6 +51,8 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
>
> +#include <arm_neon.h>
> +
>  #include "rte_mempool_trace_fp.h"
>
>  #ifdef __cplusplus
> @@ -91,11 +93,12 @@ struct rte_mempool_cache {
>         uint32_t size;        /**< Size of the cache */
>         uint32_t flushthresh; /**< Threshold before we flush excess elements */
>         uint32_t len;         /**< Current cache count */
> +       void *local_cache_base_value; /**< Base value to calculate indices */
>         /*
>          * Cache is allocated to this size to allow it to overflow in certain
>          * cases to avoid needless emptying of cache.
>          */
> -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>  } __rte_cache_aligned;
>
>  /**
> @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
>   * A list of memory where objects are stored
>   */
>  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> -
>  /**
>   * Callback used to free a memory chunk
>   */
> @@ -244,6 +246,7 @@ struct rte_mempool {
>         int32_t ops_index;
>
>         struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +       void *local_cache_base_addr; /**< Reference to the base value */
>
>         uint32_t populated_size;         /**< Number of populated objects. */
>         struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>         if (cache == NULL || cache->len == 0)
>                 return;
>         rte_mempool_trace_cache_flush(cache, mp);
> -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +
> +       unsigned int i;
> +       unsigned int cache_len = cache->len;
> +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +       void *base_value = cache->local_cache_base_value;
> +       uint32_t *cache_objs = cache->objs;
> +       for (i = 0; i < cache_len; i++)
> +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>         cache->len = 0;
>  }
>
> @@ -1289,7 +1300,9 @@ static __rte_always_inline void
>  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>                       unsigned int n, struct rte_mempool_cache *cache)
>  {
> -       void **cache_objs;
> +       uint32_t *cache_objs;
> +       void *base_value;
> +       uint32_t i;
>
>         /* increment stat now, adding in mempool always success */
>         __MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> @@ -1301,6 +1314,12 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>
>         cache_objs = &cache->objs[cache->len];
>
> +       base_value = cache->local_cache_base_value;
> +
> +       uint64x2_t v_obj_table;
> +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +       uint32x2_t v_cache_objs;
> +
>         /*
>          * The cache follows the following algorithm
>          *   1. Add the objects to the cache
> @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>          */
>
>         /* Add elements back into the cache */
> -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +
> +#if defined __ARM_NEON
> +       for (i = 0; i < (n & ~0x1); i+=2) {
> +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> +               vst1_u32(cache_objs + i, v_cache_objs);
> +       }
> +       if (n & 0x1) {
> +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +       }
> +#else
> +       for (i = 0; i < n; i++) {
> +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +       }
> +#endif
>
>         cache->len += n;
>
>         if (cache->len >= cache->flushthresh) {
> -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> +               rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>                                 cache->len - cache->size);
>                 cache->len = cache->size;
>         }
> @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>                       unsigned int n, struct rte_mempool_cache *cache)
>  {
>         int ret;
> +       uint32_t i;
>         uint32_t index, len;
> -       void **cache_objs;
> +       uint32_t *cache_objs;
>
>         /* No cache provided or cannot be satisfied from cache */
>         if (unlikely(cache == NULL || n >= cache->size))
>                 goto ring_dequeue;
>
> +       void *base_value = cache->local_cache_base_value;
>         cache_objs = cache->objs;
>
>         /* Can this be satisfied from the cache? */
>         if (cache->len < n) {
>                 /* No. Backfill the cache first, and then fill from it */
>                 uint32_t req = n + (cache->size - cache->len);
> +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>
>                 /* How many do we require i.e. number to fill the cache + the request */
>                 ret = rte_mempool_ops_dequeue_bulk(mp,
> -                       &cache->objs[cache->len], req);
> +                       temp_objs, req);
>                 if (unlikely(ret < 0)) {
>                         /*
>                          * In the off chance that we are buffer constrained,
> @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>                         goto ring_dequeue;
>                 }
>
> +               len = cache->len;
> +               for (i = 0; i < req; ++i, ++len) {
> +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
> +               }
> +
>                 cache->len += req;
>         }
>
> +       uint64x2_t v_obj_table;
> +       uint64x2_t v_cache_objs;
> +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
>         /* Now fill in the response ... */
> +#if defined __ARM_NEON
> +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> +                                               len-=2, obj_table+=2) {
> +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> +       }
> +       if (n & 0x1)
> +               *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#else
>         for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> -               *obj_table = cache_objs[len];
> +               *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
>
>         cache->len -= n;
>
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-01 12:36 ` Jerin Jacob
@ 2021-10-01 15:44   ` Honnappa Nagarahalli
  2021-10-01 17:32     ` Jerin Jacob
  0 siblings, 1 reply; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-10-01 15:44 UTC (permalink / raw)
  To: Jerin Jacob, Dharmik Thakkar
  Cc: Olivier Matz, Andrew Rybchenko, dpdk-dev, nd, Ruifeng Wang, nd

<snip>

> 
> On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> <dharmik.thakkar@arm.com> wrote:
> >
> > Current mempool per core cache implementation is based on pointer For
> > most architectures, each pointer consumes 64b Replace it with
> > index-based implementation, where in each buffer is addressed by (pool
> > address + index) It will reduce memory requirements
> >
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance and no change in throughput
> >
> > Micro-benchmarking the patch using mempool_perf_test shows significant
> > improvement with majority of the test cases
> >
> > Future plan involves replacing global pool's pointer-based
> > implementation with index-based implementation
> >
> > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> 
> 
> Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> gain.
> Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> Also, Since you are enabling only for local cache, it is good as mempool
> drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> performance numbers for various cases may be the next step.
The gain is not observed in terms of a PPS improvement, but we do see some improvements indicated by the PMU counters. This approach definitely results in savings in the number of cache lines utilized.
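
As a rough, illustrative sizing (assuming RTE_MEMPOOL_CACHE_MAX_SIZE of 512
and 64-byte cache lines; these are not measured numbers): the per-lcore
objs[] array shrinks from 512 * 3 * 8 B = 12288 B (192 cache lines) with
pointers to 512 * 3 * 4 B = 6144 B (96 cache lines) with 32-bit indices.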

> 
> > ---
> >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> >  lib/mempool/rte_mempool.c               |  8 +++
> >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> >  3 files changed, 74 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > b/drivers/mempool/ring/rte_mempool_ring.c
> > index b1f09ff28f4d..e55913e47f21 100644
> > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> rg_flags)
> >                 return -rte_errno;
> >
> >         mp->pool_data = r;
> > -
> > +       mp->local_cache_base_addr = &r[1];
> >         return 0;
> >  }
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 59a588425bd6..424bdb19c323 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
> >         int ret;
> >         bool need_iova_contig_obj;
> >         size_t max_alloc_size = SIZE_MAX;
> > +       unsigned lcore_id;
> >
> >         ret = mempool_ops_alloc_once(mp);
> >         if (ret != 0)
> > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
> >                 }
> >         }
> >
> > +       /* Init all default caches. */
> > +       if (mp->cache_size != 0) {
> > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > +                               *(void **)mp->local_cache_base_addr;
> > +       }
> > +
> >         rte_mempool_trace_populate_default(mp);
> >         return mp->size;
> >
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 4235d6f0bf2b..545405c0d3ce 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -51,6 +51,8 @@
> >  #include <rte_memcpy.h>
> >  #include <rte_common.h>
> >
> > +#include <arm_neon.h>
> > +
> >  #include "rte_mempool_trace_fp.h"
> >
> >  #ifdef __cplusplus
> > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> >         uint32_t size;        /**< Size of the cache */
> >         uint32_t flushthresh; /**< Threshold before we flush excess elements
> */
> >         uint32_t len;         /**< Current cache count */
> > +       void *local_cache_base_value; /**< Base value to calculate
> > + indices */
> >         /*
> >          * Cache is allocated to this size to allow it to overflow in certain
> >          * cases to avoid needless emptying of cache.
> >          */
> > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects
> */
> > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > + objects */
> >  } __rte_cache_aligned;
> >
> >  /**
> > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> >   * A list of memory where objects are stored
> >   */
> >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > -
> >  /**
> >   * Callback used to free a memory chunk
> >   */
> > @@ -244,6 +246,7 @@ struct rte_mempool {
> >         int32_t ops_index;
> >
> >         struct rte_mempool_cache *local_cache; /**< Per-lcore local
> > cache */
> > +       void *local_cache_base_addr; /**< Reference to the base value
> > + */
> >
> >         uint32_t populated_size;         /**< Number of populated objects. */
> >         struct rte_mempool_objhdr_list elt_list; /**< List of objects
> > in pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> rte_mempool_cache *cache,
> >         if (cache == NULL || cache->len == 0)
> >                 return;
> >         rte_mempool_trace_cache_flush(cache, mp);
> > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > +
> > +       unsigned int i;
> > +       unsigned int cache_len = cache->len;
> > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > +       void *base_value = cache->local_cache_base_value;
> > +       uint32_t *cache_objs = cache->objs;
> > +       for (i = 0; i < cache_len; i++)
> > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> >         cache->len = 0;
> >  }
> >
> > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >                       unsigned int n, struct rte_mempool_cache *cache)
> > {
> > -       void **cache_objs;
> > +       uint32_t *cache_objs;
> > +       void *base_value;
> > +       uint32_t i;
> >
> >         /* increment stat now, adding in mempool always success */
> >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12 @@
> > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >
> >         cache_objs = &cache->objs[cache->len];
> >
> > +       base_value = cache->local_cache_base_value;
> > +
> > +       uint64x2_t v_obj_table;
> > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > +       uint32x2_t v_cache_objs;
> > +
> >         /*
> >          * The cache follows the following algorithm
> >          *   1. Add the objects to the cache
> > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool
> *mp, void * const *obj_table,
> >          */
> >
> >         /* Add elements back into the cache */
> > -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > +
> > +#if defined __ARM_NEON
> > +       for (i = 0; i < (n & ~0x1); i+=2) {
> > +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> v_base_value));
> > +               vst1_u32(cache_objs + i, v_cache_objs);
> > +       }
> > +       if (n & 0x1) {
> > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > +       }
> > +#else
> > +       for (i = 0; i < n; i++) {
> > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > +       }
> > +#endif
> >
> >         cache->len += n;
> >
> >         if (cache->len >= cache->flushthresh) {
> > -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> > +               rte_mempool_ops_enqueue_bulk(mp, obj_table +
> > + cache->len - cache->size,
> >                                 cache->len - cache->size);
> >                 cache->len = cache->size;
> >         }
> > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool
> *mp, void **obj_table,
> >                       unsigned int n, struct rte_mempool_cache *cache)
> > {
> >         int ret;
> > +       uint32_t i;
> >         uint32_t index, len;
> > -       void **cache_objs;
> > +       uint32_t *cache_objs;
> >
> >         /* No cache provided or cannot be satisfied from cache */
> >         if (unlikely(cache == NULL || n >= cache->size))
> >                 goto ring_dequeue;
> >
> > +       void *base_value = cache->local_cache_base_value;
> >         cache_objs = cache->objs;
> >
> >         /* Can this be satisfied from the cache? */
> >         if (cache->len < n) {
> >                 /* No. Backfill the cache first, and then fill from it */
> >                 uint32_t req = n + (cache->size - cache->len);
> > +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**<
> > + Cache objects */
> >
> >                 /* How many do we require i.e. number to fill the cache + the
> request */
> >                 ret = rte_mempool_ops_dequeue_bulk(mp,
> > -                       &cache->objs[cache->len], req);
> > +                       temp_objs, req);
> >                 if (unlikely(ret < 0)) {
> >                         /*
> >                          * In the off chance that we are buffer
> > constrained, @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct
> rte_mempool *mp, void **obj_table,
> >                         goto ring_dequeue;
> >                 }
> >
> > +               len = cache->len;
> > +               for (i = 0; i < req; ++i, ++len) {
> > +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> base_value);
> > +               }
> > +
> >                 cache->len += req;
> >         }
> >
> > +       uint64x2_t v_obj_table;
> > +       uint64x2_t v_cache_objs;
> > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > +
> >         /* Now fill in the response ... */
> > +#if defined __ARM_NEON
> > +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > +                                               len-=2, obj_table+=2) {
> > +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > +       }
> > +       if (n & 0x1)
> > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > +cache_objs[len]); #else
> >         for (index = 0, len = cache->len - 1; index < n; ++index, len--,
> obj_table++)
> > -               *obj_table = cache_objs[len];
> > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > +cache_objs[len]); #endif
> >
> >         cache->len -= n;
> >
> > --
> > 2.17.1
> >

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-01 15:44   ` Honnappa Nagarahalli
@ 2021-10-01 17:32     ` Jerin Jacob
  2021-10-01 17:57       ` Honnappa Nagarahalli
  2021-10-01 18:21       ` Jerin Jacob
  0 siblings, 2 replies; 52+ messages in thread
From: Jerin Jacob @ 2021-10-01 17:32 UTC (permalink / raw)
  To: Honnappa Nagarahalli
  Cc: Dharmik Thakkar, Olivier Matz, Andrew Rybchenko, dpdk-dev, nd,
	Ruifeng Wang

On Fri, Oct 1, 2021 at 9:14 PM Honnappa Nagarahalli
<Honnappa.Nagarahalli@arm.com> wrote:
>
> <snip>
>
> >
> > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > <dharmik.thakkar@arm.com> wrote:
> > >
> > > Current mempool per core cache implementation is based on pointer For
> > > most architectures, each pointer consumes 64b Replace it with
> > > index-based implementation, where in each buffer is addressed by (pool
> > > address + index) It will reduce memory requirements
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance and no change in throughput
> > >
> > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > improvement with majority of the test cases
> > >
> > > Future plan involves replacing global pool's pointer-based
> > > implementation with index-based implementation
> > >
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> >
> >
> > Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> > gain.
> > Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> > Also, Since you are enabling only for local cache, it is good as mempool
> > drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> > performance numbers for various cases may be the next step.
> The gain is not observed in terms of PPS improvement, but do see some improvements that PMUs indicate. This approach definitely results in savings in number of cache lines utilized.

OK. IMO, if PPS has a regression then this path is not viable; otherwise it may be OK.


>
> >
> > > ---
> > >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> > >  lib/mempool/rte_mempool.c               |  8 +++
> > >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> > >  3 files changed, 74 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > > b/drivers/mempool/ring/rte_mempool_ring.c
> > > index b1f09ff28f4d..e55913e47f21 100644
> > > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> > rg_flags)
> > >                 return -rte_errno;
> > >
> > >         mp->pool_data = r;
> > > -
> > > +       mp->local_cache_base_addr = &r[1];
> > >         return 0;
> > >  }
> > >
> > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > index 59a588425bd6..424bdb19c323 100644
> > > --- a/lib/mempool/rte_mempool.c
> > > +++ b/lib/mempool/rte_mempool.c
> > > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> > >         int ret;
> > >         bool need_iova_contig_obj;
> > >         size_t max_alloc_size = SIZE_MAX;
> > > +       unsigned lcore_id;
> > >
> > >         ret = mempool_ops_alloc_once(mp);
> > >         if (ret != 0)
> > > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> > >                 }
> > >         }
> > >
> > > +       /* Init all default caches. */
> > > +       if (mp->cache_size != 0) {
> > > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > > +                               *(void **)mp->local_cache_base_addr;
> > > +       }
> > > +
> > >         rte_mempool_trace_populate_default(mp);
> > >         return mp->size;
> > >
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 4235d6f0bf2b..545405c0d3ce 100644
> > > --- a/lib/mempool/rte_mempool.h
> > > +++ b/lib/mempool/rte_mempool.h
> > > @@ -51,6 +51,8 @@
> > >  #include <rte_memcpy.h>
> > >  #include <rte_common.h>
> > >
> > > +#include <arm_neon.h>
> > > +
> > >  #include "rte_mempool_trace_fp.h"
> > >
> > >  #ifdef __cplusplus
> > > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> > >         uint32_t size;        /**< Size of the cache */
> > >         uint32_t flushthresh; /**< Threshold before we flush excess elements
> > */
> > >         uint32_t len;         /**< Current cache count */
> > > +       void *local_cache_base_value; /**< Base value to calculate
> > > + indices */
> > >         /*
> > >          * Cache is allocated to this size to allow it to overflow in certain
> > >          * cases to avoid needless emptying of cache.
> > >          */
> > > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects
> > */
> > > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > > + objects */
> > >  } __rte_cache_aligned;
> > >
> > >  /**
> > > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> > >   * A list of memory where objects are stored
> > >   */
> > >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > > -
> > >  /**
> > >   * Callback used to free a memory chunk
> > >   */
> > > @@ -244,6 +246,7 @@ struct rte_mempool {
> > >         int32_t ops_index;
> > >
> > >         struct rte_mempool_cache *local_cache; /**< Per-lcore local
> > > cache */
> > > +       void *local_cache_base_addr; /**< Reference to the base value
> > > + */
> > >
> > >         uint32_t populated_size;         /**< Number of populated objects. */
> > >         struct rte_mempool_objhdr_list elt_list; /**< List of objects
> > > in pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> > rte_mempool_cache *cache,
> > >         if (cache == NULL || cache->len == 0)
> > >                 return;
> > >         rte_mempool_trace_cache_flush(cache, mp);
> > > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > > +
> > > +       unsigned int i;
> > > +       unsigned int cache_len = cache->len;
> > > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > +       void *base_value = cache->local_cache_base_value;
> > > +       uint32_t *cache_objs = cache->objs;
> > > +       for (i = 0; i < cache_len; i++)
> > > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> > > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> > >         cache->len = 0;
> > >  }
> > >
> > > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > >                       unsigned int n, struct rte_mempool_cache *cache)
> > > {
> > > -       void **cache_objs;
> > > +       uint32_t *cache_objs;
> > > +       void *base_value;
> > > +       uint32_t i;
> > >
> > >         /* increment stat now, adding in mempool always success */
> > >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12 @@
> > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > >
> > >         cache_objs = &cache->objs[cache->len];
> > >
> > > +       base_value = cache->local_cache_base_value;
> > > +
> > > +       uint64x2_t v_obj_table;
> > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > +       uint32x2_t v_cache_objs;
> > > +
> > >         /*
> > >          * The cache follows the following algorithm
> > >          *   1. Add the objects to the cache
> > > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool
> > *mp, void * const *obj_table,
> > >          */
> > >
> > >         /* Add elements back into the cache */
> > > -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > > +
> > > +#if defined __ARM_NEON
> > > +       for (i = 0; i < (n & ~0x1); i+=2) {
> > > +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > > +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> > v_base_value));
> > > +               vst1_u32(cache_objs + i, v_cache_objs);
> > > +       }
> > > +       if (n & 0x1) {
> > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > > +       }
> > > +#else
> > > +       for (i = 0; i < n; i++) {
> > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > > +       }
> > > +#endif
> > >
> > >         cache->len += n;
> > >
> > >         if (cache->len >= cache->flushthresh) {
> > > -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> > > +               rte_mempool_ops_enqueue_bulk(mp, obj_table +
> > > + cache->len - cache->size,
> > >                                 cache->len - cache->size);
> > >                 cache->len = cache->size;
> > >         }
> > > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool
> > *mp, void **obj_table,
> > >                       unsigned int n, struct rte_mempool_cache *cache)
> > > {
> > >         int ret;
> > > +       uint32_t i;
> > >         uint32_t index, len;
> > > -       void **cache_objs;
> > > +       uint32_t *cache_objs;
> > >
> > >         /* No cache provided or cannot be satisfied from cache */
> > >         if (unlikely(cache == NULL || n >= cache->size))
> > >                 goto ring_dequeue;
> > >
> > > +       void *base_value = cache->local_cache_base_value;
> > >         cache_objs = cache->objs;
> > >
> > >         /* Can this be satisfied from the cache? */
> > >         if (cache->len < n) {
> > >                 /* No. Backfill the cache first, and then fill from it */
> > >                 uint32_t req = n + (cache->size - cache->len);
> > > +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**<
> > > + Cache objects */
> > >
> > >                 /* How many do we require i.e. number to fill the cache + the
> > request */
> > >                 ret = rte_mempool_ops_dequeue_bulk(mp,
> > > -                       &cache->objs[cache->len], req);
> > > +                       temp_objs, req);
> > >                 if (unlikely(ret < 0)) {
> > >                         /*
> > >                          * In the off chance that we are buffer
> > > constrained, @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct
> > rte_mempool *mp, void **obj_table,
> > >                         goto ring_dequeue;
> > >                 }
> > >
> > > +               len = cache->len;
> > > +               for (i = 0; i < req; ++i, ++len) {
> > > +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> > base_value);
> > > +               }
> > > +
> > >                 cache->len += req;
> > >         }
> > >
> > > +       uint64x2_t v_obj_table;
> > > +       uint64x2_t v_cache_objs;
> > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > +
> > >         /* Now fill in the response ... */
> > > +#if defined __ARM_NEON
> > > +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > > +                                               len-=2, obj_table+=2) {
> > > +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > > +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > > +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > > +       }
> > > +       if (n & 0x1)
> > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > +cache_objs[len]); #else
> > >         for (index = 0, len = cache->len - 1; index < n; ++index, len--,
> > obj_table++)
> > > -               *obj_table = cache_objs[len];
> > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > +cache_objs[len]); #endif
> > >
> > >         cache->len -= n;
> > >
> > > --
> > > 2.17.1
> > >

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-01 17:32     ` Jerin Jacob
@ 2021-10-01 17:57       ` Honnappa Nagarahalli
  2021-10-01 18:21       ` Jerin Jacob
  1 sibling, 0 replies; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-10-01 17:57 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Dharmik Thakkar, Olivier Matz, Andrew Rybchenko, dpdk-dev, nd,
	Ruifeng Wang, nd

<snip>
> >
> > >
> > > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > > <dharmik.thakkar@arm.com> wrote:
> > > >
> > > > Current mempool per core cache implementation is based on pointer
> > > > For most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by
> > > > (pool address + index) It will reduce memory requirements
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance and no change in throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows
> > > > significant improvement with majority of the test cases
> > > >
> > > > Future plan involves replacing global pool's pointer-based
> > > > implementation with index-based implementation
> > > >
> > > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > >
> > >
> > > Sane idea. Like VPP, we tried to do this for rte_graph, but not
> > > observed much gain.
> > > Since lcore cache is typically 512, maybe there is a gain on the mempool
> path.
> > > Also, Since you are enabling only for local cache, it is good as
> > > mempool drivers can work as-is.(i.e HW drivers works with 64bit) I
> > > think, getting more performance numbers for various cases may be the
> next step.
> > The gain is not observed in terms of PPS improvement, but do see some
> improvements that PMUs indicate. This approach definitely results in savings
> in number of cache lines utilized.
> 
> OK. IMO, If PPS has regression then this path is not viable, else it may be OK.
PPS has not regressed. It has improved, but not significantly.
Another way to look at this is that we are doing the same work with fewer resources.

> 
> 
> >
> > >
> > > > ---
> > > >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> > > >  lib/mempool/rte_mempool.c               |  8 +++
> > > >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> > > >  3 files changed, 74 insertions(+), 10 deletions(-)
> > > >
> > > > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > > > b/drivers/mempool/ring/rte_mempool_ring.c
> > > > index b1f09ff28f4d..e55913e47f21 100644
> > > > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > > > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > > > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> > > rg_flags)
> > > >                 return -rte_errno;
> > > >
> > > >         mp->pool_data = r;
> > > > -
> > > > +       mp->local_cache_base_addr = &r[1];
> > > >         return 0;
> > > >  }
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > > index 59a588425bd6..424bdb19c323 100644
> > > > --- a/lib/mempool/rte_mempool.c
> > > > +++ b/lib/mempool/rte_mempool.c
> > > > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > > >         int ret;
> > > >         bool need_iova_contig_obj;
> > > >         size_t max_alloc_size = SIZE_MAX;
> > > > +       unsigned lcore_id;
> > > >
> > > >         ret = mempool_ops_alloc_once(mp);
> > > >         if (ret != 0)
> > > > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > > >                 }
> > > >         }
> > > >
> > > > +       /* Init all default caches. */
> > > > +       if (mp->cache_size != 0) {
> > > > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > > > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > > > +                               *(void **)mp->local_cache_base_addr;
> > > > +       }
> > > > +
> > > >         rte_mempool_trace_populate_default(mp);
> > > >         return mp->size;
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > > index 4235d6f0bf2b..545405c0d3ce 100644
> > > > --- a/lib/mempool/rte_mempool.h
> > > > +++ b/lib/mempool/rte_mempool.h
> > > > @@ -51,6 +51,8 @@
> > > >  #include <rte_memcpy.h>
> > > >  #include <rte_common.h>
> > > >
> > > > +#include <arm_neon.h>
> > > > +
> > > >  #include "rte_mempool_trace_fp.h"
> > > >
> > > >  #ifdef __cplusplus
> > > > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> > > >         uint32_t size;        /**< Size of the cache */
> > > >         uint32_t flushthresh; /**< Threshold before we flush
> > > > excess elements
> > > */
> > > >         uint32_t len;         /**< Current cache count */
> > > > +       void *local_cache_base_value; /**< Base value to calculate
> > > > + indices */
> > > >         /*
> > > >          * Cache is allocated to this size to allow it to overflow in certain
> > > >          * cases to avoid needless emptying of cache.
> > > >          */
> > > > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> objects
> > > */
> > > > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > > > + objects */
> > > >  } __rte_cache_aligned;
> > > >
> > > >  /**
> > > > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> > > >   * A list of memory where objects are stored
> > > >   */
> > > >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > > > -
> > > >  /**
> > > >   * Callback used to free a memory chunk
> > > >   */
> > > > @@ -244,6 +246,7 @@ struct rte_mempool {
> > > >         int32_t ops_index;
> > > >
> > > >         struct rte_mempool_cache *local_cache; /**< Per-lcore
> > > > local cache */
> > > > +       void *local_cache_base_addr; /**< Reference to the base
> > > > + value */
> > > >
> > > >         uint32_t populated_size;         /**< Number of populated objects. */
> > > >         struct rte_mempool_objhdr_list elt_list; /**< List of
> > > > objects in pool */ @@ -1269,7 +1272,15 @@
> > > > rte_mempool_cache_flush(struct
> > > rte_mempool_cache *cache,
> > > >         if (cache == NULL || cache->len == 0)
> > > >                 return;
> > > >         rte_mempool_trace_cache_flush(cache, mp);
> > > > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > > > +
> > > > +       unsigned int i;
> > > > +       unsigned int cache_len = cache->len;
> > > > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > > +       void *base_value = cache->local_cache_base_value;
> > > > +       uint32_t *cache_objs = cache->objs;
> > > > +       for (i = 0; i < cache_len; i++)
> > > > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value,
> cache_objs[i]);
> > > > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> > > >         cache->len = 0;
> > > >  }
> > > >
> > > > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > > > __mempool_generic_put(struct rte_mempool *mp, void * const
> *obj_table,
> > > >                       unsigned int n, struct rte_mempool_cache
> > > > *cache) {
> > > > -       void **cache_objs;
> > > > +       uint32_t *cache_objs;
> > > > +       void *base_value;
> > > > +       uint32_t i;
> > > >
> > > >         /* increment stat now, adding in mempool always success */
> > > >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12
> > > > @@ __mempool_generic_put(struct rte_mempool *mp, void * const
> > > > *obj_table,
> > > >
> > > >         cache_objs = &cache->objs[cache->len];
> > > >
> > > > +       base_value = cache->local_cache_base_value;
> > > > +
> > > > +       uint64x2_t v_obj_table;
> > > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > > +       uint32x2_t v_cache_objs;
> > > > +
> > > >         /*
> > > >          * The cache follows the following algorithm
> > > >          *   1. Add the objects to the cache
> > > > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct
> rte_mempool
> > > *mp, void * const *obj_table,
> > > >          */
> > > >
> > > >         /* Add elements back into the cache */
> > > > -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > > > +
> > > > +#if defined __ARM_NEON
> > > > +       for (i = 0; i < (n & ~0x1); i+=2) {
> > > > +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > > > +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> > > v_base_value));
> > > > +               vst1_u32(cache_objs + i, v_cache_objs);
> > > > +       }
> > > > +       if (n & 0x1) {
> > > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> base_value);
> > > > +       }
> > > > +#else
> > > > +       for (i = 0; i < n; i++) {
> > > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> base_value);
> > > > +       }
> > > > +#endif
> > > >
> > > >         cache->len += n;
> > > >
> > > >         if (cache->len >= cache->flushthresh) {
> > > > -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache-
> >size],
> > > > +               rte_mempool_ops_enqueue_bulk(mp, obj_table +
> > > > + cache->len - cache->size,
> > > >                                 cache->len - cache->size);
> > > >                 cache->len = cache->size;
> > > >         }
> > > > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct
> rte_mempool
> > > *mp, void **obj_table,
> > > >                       unsigned int n, struct rte_mempool_cache
> > > > *cache) {
> > > >         int ret;
> > > > +       uint32_t i;
> > > >         uint32_t index, len;
> > > > -       void **cache_objs;
> > > > +       uint32_t *cache_objs;
> > > >
> > > >         /* No cache provided or cannot be satisfied from cache */
> > > >         if (unlikely(cache == NULL || n >= cache->size))
> > > >                 goto ring_dequeue;
> > > >
> > > > +       void *base_value = cache->local_cache_base_value;
> > > >         cache_objs = cache->objs;
> > > >
> > > >         /* Can this be satisfied from the cache? */
> > > >         if (cache->len < n) {
> > > >                 /* No. Backfill the cache first, and then fill from it */
> > > >                 uint32_t req = n + (cache->size - cache->len);
> > > > +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > > + /**< Cache objects */
> > > >
> > > >                 /* How many do we require i.e. number to fill the
> > > > cache + the
> > > request */
> > > >                 ret = rte_mempool_ops_dequeue_bulk(mp,
> > > > -                       &cache->objs[cache->len], req);
> > > > +                       temp_objs, req);
> > > >                 if (unlikely(ret < 0)) {
> > > >                         /*
> > > >                          * In the off chance that we are buffer
> > > > constrained, @@ -1442,12 +1478,32 @@
> __mempool_generic_get(struct
> > > rte_mempool *mp, void **obj_table,
> > > >                         goto ring_dequeue;
> > > >                 }
> > > >
> > > > +               len = cache->len;
> > > > +               for (i = 0; i < req; ++i, ++len) {
> > > > +                       cache_objs[len] = (uint32_t)
> > > > + RTE_PTR_DIFF(temp_objs[i],
> > > base_value);
> > > > +               }
> > > > +
> > > >                 cache->len += req;
> > > >         }
> > > >
> > > > +       uint64x2_t v_obj_table;
> > > > +       uint64x2_t v_cache_objs;
> > > > +       uint64x2_t v_base_value =
> > > > + vdupq_n_u64((uint64_t)base_value);
> > > > +
> > > >         /* Now fill in the response ... */
> > > > +#if defined __ARM_NEON
> > > > +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > > > +                                               len-=2, obj_table+=2) {
> > > > +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > > > +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > > > +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > > > +       }
> > > > +       if (n & 0x1)
> > > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > > +cache_objs[len]); #else
> > > >         for (index = 0, len = cache->len - 1; index < n; ++index,
> > > > len--,
> > > obj_table++)
> > > > -               *obj_table = cache_objs[len];
> > > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > > +cache_objs[len]); #endif
> > > >
> > > >         cache->len -= n;
> > > >
> > > > --
> > > > 2.17.1
> > > >

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-01 17:32     ` Jerin Jacob
  2021-10-01 17:57       ` Honnappa Nagarahalli
@ 2021-10-01 18:21       ` Jerin Jacob
  1 sibling, 0 replies; 52+ messages in thread
From: Jerin Jacob @ 2021-10-01 18:21 UTC (permalink / raw)
  To: Honnappa Nagarahalli
  Cc: Dharmik Thakkar, Olivier Matz, Andrew Rybchenko, dpdk-dev, nd,
	Ruifeng Wang

On Fri, Oct 1, 2021 at 11:02 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> On Fri, Oct 1, 2021 at 9:14 PM Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com> wrote:
> >
> > <snip>
> >
> > >
> > > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > > <dharmik.thakkar@arm.com> wrote:
> > > >
> > > > Current mempool per core cache implementation is based on pointer For
> > > > most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by (pool
> > > > address + index) It will reduce memory requirements
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance and no change in throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > > improvement with majority of the test cases
> > > >
> > > > Future plan involves replacing global pool's pointer-based
> > > > implementation with index-based implementation
> > > >
> > > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > >
> > >
> > > Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> > > gain.
> > > Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> > > Also, Since you are enabling only for local cache, it is good as mempool
> > > drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> > > performance numbers for various cases may be the next step.
> > The gain is not observed in terms of PPS improvement, but do see some improvements that PMUs indicate. This approach definitely results in savings in number of cache lines utilized.
>
> OK. IMO, If PPS has regression then this path is not viable, else it may be OK.

Looks good then.

> > > > <snip>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-09-30 17:27 [dpdk-dev] [RFC] mempool: implement index-based per core cache Dharmik Thakkar
  2021-10-01 12:36 ` Jerin Jacob
@ 2021-10-01 21:30 ` Ananyev, Konstantin
  2021-10-02  0:07   ` Honnappa Nagarahalli
  2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
  2 siblings, 1 reply; 52+ messages in thread
From: Ananyev, Konstantin @ 2021-10-01 21:30 UTC (permalink / raw)
  To: Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang

> Current mempool per core cache implementation is based on pointer
> For most architectures, each pointer consumes 64b
> Replace it with index-based implementation, where in each buffer
> is addressed by (pool address + index)

I don't think it is going to work:
on 64-bit systems, the difference between the pool address and an element's
address could be bigger than 4GB, which does not fit in a 32-bit index.
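
A minimal sketch of the truncation (the addresses below are made up purely for
illustration, 64-bit system assumed):

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uintptr_t base = 0x100000000UL;          /* made-up pool base address */
                uintptr_t elem = base + (5UL << 30);     /* an element 5 GB above it  */
                uint32_t idx = (uint32_t)(elem - base);  /* truncated: only 1 GB kept */

                printf("real diff = %lu, stored index = %u\n",
                       (unsigned long)(elem - base), (unsigned)idx);
                return 0;
        }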
 
> It will reduce memory requirements
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance and no change in throughput
> 
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
> 
> Future plan involves replacing global pool's pointer-based implementation with index-based implementation
> 
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> <snip>


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-01 21:30 ` Ananyev, Konstantin
@ 2021-10-02  0:07   ` Honnappa Nagarahalli
  2021-10-02 18:51     ` Ananyev, Konstantin
  0 siblings, 1 reply; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-10-02  0:07 UTC (permalink / raw)
  To: Ananyev, Konstantin, Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, Ruifeng Wang, nd

<snip>
> 
> > Current mempool per core cache implementation is based on pointer For
> > most architectures, each pointer consumes 64b Replace it with
> > index-based implementation, where in each buffer is addressed by (pool
> > address + index)
> 
> I don't think it is going to work:
> On 64-bit systems difference between pool address and it's elem address
> could be bigger than 4GB.
Are you talking about a case where the memory pool size is more than 4GB?

> 
> > It will reduce memory requirements
> >
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance and no change in throughput
> >
> > Micro-benchmarking the patch using mempool_perf_test shows significant
> > improvement with majority of the test cases
> >
> > Future plan involves replacing global pool's pointer-based
> > implementation with index-based implementation
> >
> > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > <snip>


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-02  0:07   ` Honnappa Nagarahalli
@ 2021-10-02 18:51     ` Ananyev, Konstantin
  2021-10-04 16:36       ` Honnappa Nagarahalli
  0 siblings, 1 reply; 52+ messages in thread
From: Ananyev, Konstantin @ 2021-10-02 18:51 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, Ruifeng Wang, nd


> > > Current mempool per core cache implementation is based on pointer For
> > > most architectures, each pointer consumes 64b Replace it with
> > > index-based implementation, where in each buffer is addressed by (pool
> > > address + index)
> >
> > I don't think it is going to work:
> > On 64-bit systems difference between pool address and it's elem address
> > could be bigger than 4GB.
> Are you talking about a case where the memory pool size is more than 4GB?

That is one possible scenario.
Another possibility: the user populates the mempool himself with some external
memory by calling rte_mempool_populate_iova() directly.
I suppose such a situation can occur even with a normal rte_mempool_create(),
though it should be a really rare one.
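
A minimal sketch of that second scenario (the helper and its parameter names are
illustrative, error handling trimmed): two unrelated chunks are handed to the same
pool, so their virtual addresses need not be anywhere near each other, and a single
32-bit offset from one base cannot cover both.

        #include <rte_mempool.h>

        /* Hypothetical helper: populate one pool from two unrelated memory chunks. */
        static int
        populate_from_external(struct rte_mempool *mp,
                               char *va1, rte_iova_t iova1,
                               char *va2, rte_iova_t iova2, size_t len)
        {
                int ret = rte_mempool_populate_iova(mp, va1, iova1, len, NULL, NULL);
                if (ret < 0)
                        return ret;
                /* va2 may be gigabytes away from va1 in the virtual address space. */
                return rte_mempool_populate_iova(mp, va2, iova2, len, NULL, NULL);
        }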

> 
> >
> > > It will reduce memory requirements
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance and no change in throughput
> > >
> > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > improvement with majority of the test cases
> > >
> > > Future plan involves replacing global pool's pointer-based
> > > implementation with index-based implementation
> > >
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > > <snip>


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-02 18:51     ` Ananyev, Konstantin
@ 2021-10-04 16:36       ` Honnappa Nagarahalli
  2021-10-30 10:23         ` Morten Brørup
  2021-10-31  8:14         ` Morten Brørup
  0 siblings, 2 replies; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-10-04 16:36 UTC (permalink / raw)
  To: Ananyev, Konstantin, Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, Ruifeng Wang, nd, nd

<snip>
> 
> 
> > > > Current mempool per core cache implementation is based on pointer
> > > > For most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by
> > > > (pool address + index)
> > >
> > > I don't think it is going to work:
> > > On 64-bit systems difference between pool address and it's elem
> > > address could be bigger than 4GB.
> > Are you talking about a case where the memory pool size is more than 4GB?
> 
> That is one possible scenario.
> Another possibility - user populates mempool himself with some external
> memory by calling rte_mempool_populate_iova() directly.
Is the concern that IOVA might not be contiguous for all the memory used by the mempool?

> I suppose such situation can even occur even with normal
> rte_mempool_create(), though it should be a really rare one.
All in all, this feature needs to be configurable at compile time.
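
A minimal sketch of what that could look like in struct rte_mempool_cache (the
RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE name is hypothetical, not an existing DPDK
build option):

        /* Inside struct rte_mempool_cache: */
        #ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE      /* hypothetical build-time option */
                uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /* 32-bit offsets */
        #else
                void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];    /* full pointers  */
        #endif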

> 
> >
> > >
<snip>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-04 16:36       ` Honnappa Nagarahalli
@ 2021-10-30 10:23         ` Morten Brørup
  2021-10-31  8:14         ` Morten Brørup
  1 sibling, 0 replies; 52+ messages in thread
From: Morten Brørup @ 2021-10-30 10:23 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Ananyev, Konstantin, Dharmik Thakkar,
	Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, Ruifeng Wang, nd, nd

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 4 October 2021 18.36
> 
> <snip>
> >
> >
> > > > > Current mempool per core cache implementation is based on
> pointer
> > > > > For most architectures, each pointer consumes 64b Replace it
> with
> > > > > index-based implementation, where in each buffer is addressed
> by
> > > > > (pool address + index)

I like Dharmik's suggestion very much. CPU cache is a critical and limited resource.

DPDK has a tendency of using pointers where indexes could be used instead. I suppose pointers provide the additional flexibility of mixing entries from different memory pools, e.g. multiple mbuf pools.

> > > >
> > > > I don't think it is going to work:
> > > > On 64-bit systems difference between pool address and it's elem
> > > > address could be bigger than 4GB.
> > > Are you talking about a case where the memory pool size is more
> than 4GB?
> >
> > That is one possible scenario.

That could be solved by making the index an element index instead of a pointer offset: address = (pool address + index * element size).
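
A minimal sketch of that conversion (illustrative helpers, not part of the patch;
'base' stands for the address of the pool's first object and 'elt_size' for the
per-object stride):

        #include <stddef.h>
        #include <stdint.h>
        #include <rte_common.h>

        /* index -> address: scale the 32-bit index by the element size */
        static inline void *
        obj_from_index(void *base, uint32_t idx, uint32_t elt_size)
        {
                return RTE_PTR_ADD(base, (size_t)idx * elt_size);
        }

        /* address -> index: assumes objects are laid out back to back from base */
        static inline uint32_t
        index_from_obj(void *base, const void *obj, uint32_t elt_size)
        {
                return (uint32_t)(RTE_PTR_DIFF(obj, base) / elt_size);
        }

The division on the address-to-index path is the cost of a runtime element size,
which is what scaling by a compile-time constant, as suggested in the follow-up
below, avoids.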

> > Another possibility - user populates mempool himself with some
> external
> > memory by calling rte_mempool_populate_iova() directly.
> Is the concern that IOVA might not be contiguous for all the memory
> used by the mempool?
> 
> > I suppose such situation can even occur even with normal
> > rte_mempool_create(), though it should be a really rare one.
> All in all, this feature needs to be configurable during compile time.


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-04 16:36       ` Honnappa Nagarahalli
  2021-10-30 10:23         ` Morten Brørup
@ 2021-10-31  8:14         ` Morten Brørup
  2021-11-03 15:12           ` Dharmik Thakkar
  1 sibling, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-10-31  8:14 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Ananyev, Konstantin, Dharmik Thakkar,
	Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, Ruifeng Wang, nd, nd

> From: Morten Brørup
> Sent: Saturday, 30 October 2021 12.24
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> > Nagarahalli
> > Sent: Monday, 4 October 2021 18.36
> >
> > <snip>
> > >
> > >
> > > > > > Current mempool per core cache implementation is based on
> > pointer
> > > > > > For most architectures, each pointer consumes 64b Replace it
> > with
> > > > > > index-based implementation, where in each buffer is addressed
> > by
> > > > > > (pool address + index)
> 
> I like Dharmik's suggestion very much. CPU cache is a critical and
> limited resource.
> 
> DPDK has a tendency of using pointers where indexes could be used
> instead. I suppose pointers provide the additional flexibility of
> mixing entries from different memory pools, e.g. multiple mbuf pools.
> 
> > > > >
> > > > > I don't think it is going to work:
> > > > > On 64-bit systems difference between pool address and it's elem
> > > > > address could be bigger than 4GB.
> > > > Are you talking about a case where the memory pool size is more
> > than 4GB?
> > >
> > > That is one possible scenario.
> 
> That could be solved by making the index an element index instead of a
> pointer offset: address = (pool address + index * element size).

Or, instead of scaling the index with the element size, which is only known at runtime, the index could be more efficiently scaled by a compile-time constant such as RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE). With a cache line size of 64 bytes, that would allow indexing into mempools up to 256 GB in size.
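
(As a check on that figure: 2^32 indices * 64 B per index = 256 GB. The trade-off is that every object must then start on a 64-byte boundary for the scaled index to land exactly on it.)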

> 
> > > Another possibility - user populates mempool himself with some
> > external
> > > memory by calling rte_mempool_populate_iova() directly.
> > Is the concern that IOVA might not be contiguous for all the memory
> > used by the mempool?
> >
> > > I suppose such situation can even occur even with normal
> > > rte_mempool_create(), though it should be a really rare one.
> > All in all, this feature needs to be configurable during compile
> time.


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-10-31  8:14         ` Morten Brørup
@ 2021-11-03 15:12           ` Dharmik Thakkar
  2021-11-03 15:52             ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2021-11-03 15:12 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Honnappa Nagarahalli, Ananyev, Konstantin, Olivier Matz,
	Andrew Rybchenko, dev, nd, Ruifeng Wang

Hi,

Thank you everyone for the comments! I am currently working on making the global pool ring’s implementation index-based as well.
Once done, I will send a patch for community review. I will also make it a compile-time option.

> On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: Morten Brørup
>> Sent: Saturday, 30 October 2021 12.24
>> 
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
>>> Nagarahalli
>>> Sent: Monday, 4 October 2021 18.36
>>> 
>>> <snip>
>>>> 
>>>> 
>>>>>>> Current mempool per core cache implementation is based on
>>> pointer
>>>>>>> For most architectures, each pointer consumes 64b Replace it
>>> with
>>>>>>> index-based implementation, where in each buffer is addressed
>>> by
>>>>>>> (pool address + index)
>> 
>> I like Dharmik's suggestion very much. CPU cache is a critical and
>> limited resource.
>> 
>> DPDK has a tendency of using pointers where indexes could be used
>> instead. I suppose pointers provide the additional flexibility of
>> mixing entries from different memory pools, e.g. multiple mbuf pools.
>> 

Agreed, thank you!

>>>>>> 
>>>>>> I don't think it is going to work:
>>>>>> On 64-bit systems difference between pool address and it's elem
>>>>>> address could be bigger than 4GB.
>>>>> Are you talking about a case where the memory pool size is more
>>> than 4GB?
>>>> 
>>>> That is one possible scenario.
>> 
>> That could be solved by making the index an element index instead of a
>> pointer offset: address = (pool address + index * element size).
> 
> Or instead of scaling the index with the element size, which is only known at runtime, the index could be more efficiently scaled by a compile time constant such as RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would allow indexing into mempools up to 256 GB in size.
> 

Looking at this snippet [1] from rte_mempool_op_populate_helper(), an ‘offset’ is added to prevent objects from crossing page boundaries. If my understanding is correct, using an element index instead of a pointer offset will pose a challenge in such corner cases, because the extra padding means an object’s address is no longer a fixed multiple of the element size away from the pool base.

[1]
        for (i = 0; i < max_objs; i++) {                                           
                /* avoid objects to cross page boundaries */
                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
                                off += total_elt_sz -
                                        (((uintptr_t)(va + off - 1) %
                                                total_elt_sz) + 1);
                }

>> 
>>>> Another possibility - user populates mempool himself with some
>>> external
>>>> memory by calling rte_mempool_populate_iova() directly.
>>> Is the concern that IOVA might not be contiguous for all the memory
>>> used by the mempool?
>>> 
>>>> I suppose such situation can even occur even with normal
>>>> rte_mempool_create(), though it should be a really rare one.
>>> All in all, this feature needs to be configurable during compile
>> time.
> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-03 15:12           ` Dharmik Thakkar
@ 2021-11-03 15:52             ` Morten Brørup
  2021-11-04  4:42               ` Dharmik Thakkar
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-11-03 15:52 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: Honnappa Nagarahalli, Ananyev, Konstantin, Olivier Matz,
	Andrew Rybchenko, dev, nd, Ruifeng Wang

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> Sent: Wednesday, 3 November 2021 16.13
> 
> Hi,
> 
> Thank you everyone for the comments! I am currently working on making
> the global pool ring’s implementation as index based.
> Once done, I will send a patch for community review. I will also make
> it as a compile time option.

Sounds good to me.

This could probably be abstracted to other libraries too. E.g. the ring library holds pointers to objects (void *); an alternative ring library could hold indexes to objects (uint32_t). A ring often holds objects from the same mempool, and the application knows which mempool, so indexing would be useful here too.

> 
> > On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: Morten Brørup
> >> Sent: Saturday, 30 October 2021 12.24
> >>
> >>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> >>> Nagarahalli
> >>> Sent: Monday, 4 October 2021 18.36
> >>>
> >>> <snip>
> >>>>
> >>>>
> >>>>>>> Current mempool per core cache implementation is based on
> >>> pointer
> >>>>>>> For most architectures, each pointer consumes 64b Replace it
> >>> with
> >>>>>>> index-based implementation, where in each buffer is addressed
> >>> by
> >>>>>>> (pool address + index)
> >>
> >> I like Dharmik's suggestion very much. CPU cache is a critical and
> >> limited resource.
> >>
> >> DPDK has a tendency of using pointers where indexes could be used
> >> instead. I suppose pointers provide the additional flexibility of
> >> mixing entries from different memory pools, e.g. multiple mbuf
> pools.
> >>
> 
> Agreed, thank you!
> 
> >>>>>>
> >>>>>> I don't think it is going to work:
> >>>>>> On 64-bit systems difference between pool address and it's elem
> >>>>>> address could be bigger than 4GB.
> >>>>> Are you talking about a case where the memory pool size is more
> >>> than 4GB?
> >>>>
> >>>> That is one possible scenario.
> >>
> >> That could be solved by making the index an element index instead of
> a
> >> pointer offset: address = (pool address + index * element size).
> >
> > Or instead of scaling the index with the element size, which is only
> known at runtime, the index could be more efficiently scaled by a
> compile time constant such as RTE_MEMPOOL_ALIGN (=
> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> allow indexing into mempools up to 256 GB in size.
> >
> 
> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> there is an ‘offset’ added to avoid objects to cross page boundaries.
> If my understanding is correct, using the index of element instead of a
> pointer offset will pose a challenge for some of the corner cases.
> 
> [1]
>         for (i = 0; i < max_objs; i++) {
>                 /* avoid objects to cross page boundaries */
>                 if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
> 0) {
>                         off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> (va + off);
>                         if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
>                                 off += total_elt_sz -
>                                         (((uintptr_t)(va + off - 1) %
>                                                 total_elt_sz) + 1);
>                 }
> 

OK. As an alternative to scaling the index with the cache line size, you can scale it with sizeof(uintptr_t) to be able to address 32 GB or 16 GB mempools on 64-bit and 32-bit architectures respectively. Both x86 and ARM CPUs have instructions to access memory with an added offset multiplied by 4 or 8, so that should be high performance.
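
(For reference: 2^32 * 8 B = 32 GB and 2^32 * 4 B = 16 GB. Scaling by sizeof(uintptr_t) only requires the natural pointer alignment that mempool objects normally already have, rather than the 64-byte alignment assumed above.)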

> >>
> >>>> Another possibility - user populates mempool himself with some
> >>> external
> >>>> memory by calling rte_mempool_populate_iova() directly.
> >>> Is the concern that IOVA might not be contiguous for all the memory
> >>> used by the mempool?
> >>>
> >>>> I suppose such situation can even occur even with normal
> >>>> rte_mempool_create(), though it should be a really rare one.
> >>> All in all, this feature needs to be configurable during compile
> >> time.
> >


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-03 15:52             ` Morten Brørup
@ 2021-11-04  4:42               ` Dharmik Thakkar
  2021-11-04  8:04                 ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2021-11-04  4:42 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Honnappa Nagarahalli, Ananyev, Konstantin, Olivier Matz,
	Andrew Rybchenko, dev, nd, Ruifeng Wang



> On Nov 3, 2021, at 10:52 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
>> Sent: Wednesday, 3 November 2021 16.13
>> 
>> Hi,
>> 
>> Thank you everyone for the comments! I am currently working on making
>> the global pool ring’s implementation as index based.
>> Once done, I will send a patch for community review. I will also make
>> it as a compile time option.
> 
> Sounds good to me.
> 
> This could probably be abstracted to other libraries too. E.g. the ring library holds pointers to objects (void *); an alternative ring library could hold indexes to objects (uint32_t). A ring often holds objects from the same mempool, and the application knows which mempool, so indexing would be useful here too.
> 

Yes, the ring library within DPDK has APIs that support a configurable element size.
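
A minimal sketch of using those APIs for a 32-bit-index ring (ring name, size and
flags below are illustrative only):

        #include <stdint.h>
        #include <rte_lcore.h>
        #include <rte_ring.h>
        #include <rte_ring_elem.h>

        /* A ring whose slots are 4-byte object indices instead of 8-byte pointers. */
        static struct rte_ring *
        make_index_ring(void)
        {
                return rte_ring_create_elem("idx_ring", sizeof(uint32_t), 4096,
                                            rte_socket_id(),
                                            RING_F_SP_ENQ | RING_F_SC_DEQ);
        }

        static unsigned int
        put_indices(struct rte_ring *r, const uint32_t *idx, unsigned int n)
        {
                return rte_ring_enqueue_burst_elem(r, idx, sizeof(uint32_t), n, NULL);
        }

        static unsigned int
        get_indices(struct rte_ring *r, uint32_t *idx, unsigned int n)
        {
                return rte_ring_dequeue_burst_elem(r, idx, sizeof(uint32_t), n, NULL);
        }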

>> 
>>> On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com>
>> wrote:
>>> 
>>>> From: Morten Brørup
>>>> Sent: Saturday, 30 October 2021 12.24
>>>> 
>>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
>>>>> Nagarahalli
>>>>> Sent: Monday, 4 October 2021 18.36
>>>>> 
>>>>> <snip>
>>>>>> 
>>>>>> 
>>>>>>>>> Current mempool per core cache implementation is based on
>>>>> pointer
>>>>>>>>> For most architectures, each pointer consumes 64b Replace it
>>>>> with
>>>>>>>>> index-based implementation, where in each buffer is addressed
>>>>> by
>>>>>>>>> (pool address + index)
>>>> 
>>>> I like Dharmik's suggestion very much. CPU cache is a critical and
>>>> limited resource.
>>>> 
>>>> DPDK has a tendency of using pointers where indexes could be used
>>>> instead. I suppose pointers provide the additional flexibility of
>>>> mixing entries from different memory pools, e.g. multiple mbuf
>> pools.
>>>> 
>> 
>> Agreed, thank you!
>> 
>>>>>>>> 
>>>>>>>> I don't think it is going to work:
>>>>>>>> On 64-bit systems difference between pool address and it's elem
>>>>>>>> address could be bigger than 4GB.
>>>>>>> Are you talking about a case where the memory pool size is more
>>>>> than 4GB?
>>>>>> 
>>>>>> That is one possible scenario.
>>>> 
>>>> That could be solved by making the index an element index instead of
>> a
>>>> pointer offset: address = (pool address + index * element size).
>>> 
>>> Or instead of scaling the index with the element size, which is only
>> known at runtime, the index could be more efficiently scaled by a
>> compile time constant such as RTE_MEMPOOL_ALIGN (=
>> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
>> allow indexing into mempools up to 256 GB in size.
>>> 
>> 
>> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
>> there is an ‘offset’ added to avoid objects to cross page boundaries.
>> If my understanding is correct, using the index of element instead of a
>> pointer offset will pose a challenge for some of the corner cases.
>> 
>> [1]
>>        for (i = 0; i < max_objs; i++) {
>>                /* avoid objects to cross page boundaries */
>>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
>> 0) {
>>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
>> (va + off);
>>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
>>                                off += total_elt_sz -
>>                                        (((uintptr_t)(va + off - 1) %
>>                                                total_elt_sz) + 1);
>>                }
>> 
> 
> OK. Alternatively to scaling the index with a cache line size, you can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB mempools on respectively 64 bit and 32 bit architectures. Both x86 and ARM CPUs have instructions to access memory with an added offset multiplied by 4 or 8. So that should be high performance.

Yes, agreed this can be done.
Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is not enabled.
On a side note, I wanted to better understand the need for having the 'MEMPOOL_F_NO_CACHE_ALIGN' option.
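Coming back to the offset math above, a small sketch to make the numbers concrete (macro and
parameter names are placeholders, not the final patch, and it assumes every object starts at a
multiple of the chosen unit from the pool base):

#include <stdint.h>
#include <rte_common.h>
#include <rte_memory.h>	/* RTE_CACHE_LINE_SIZE */

/*
 * With a 32-bit index and a fixed unit, the maximum addressable pool is
 * 2^32 * unit:
 *   unit = 1 (plain byte offset)        ->   4 GB
 *   unit = sizeof(uintptr_t) (4 / 8)    ->  16 GB / 32 GB
 *   unit = RTE_CACHE_LINE_SIZE (64)     -> 256 GB
 */
static inline void *
idx_to_obj(void *pool_base, uint32_t idx)
{
	return RTE_PTR_ADD(pool_base, (uintptr_t)idx * RTE_CACHE_LINE_SIZE);
}

static inline uint32_t
obj_to_idx(void *pool_base, void *obj)
{
	return (uint32_t)(RTE_PTR_DIFF(obj, pool_base) / RTE_CACHE_LINE_SIZE);
}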

> 
>>>> 
>>>>>> Another possibility - user populates mempool himself with some
>>>>> external
>>>>>> memory by calling rte_mempool_populate_iova() directly.
>>>>> Is the concern that IOVA might not be contiguous for all the memory
>>>>> used by the mempool?
>>>>> 
>>>>>> I suppose such situation can even occur even with normal
>>>>>> rte_mempool_create(), though it should be a really rare one.
>>>>> All in all, this feature needs to be configurable during compile
>>>> time.
>>> 
> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-04  4:42               ` Dharmik Thakkar
@ 2021-11-04  8:04                 ` Morten Brørup
  2021-11-08  4:32                   ` Honnappa Nagarahalli
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-11-04  8:04 UTC (permalink / raw)
  To: Dharmik Thakkar, Honnappa Nagarahalli, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang

+ Ring library maintainers (@Honnappa and @Konstantin) for my rants about its documentation.

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> Sent: Thursday, 4 November 2021 05.42
> 
> > On Nov 3, 2021, at 10:52 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> >> Sent: Wednesday, 3 November 2021 16.13
> >>
> >> Hi,
> >>
> >> Thank you everyone for the comments! I am currently working on
> making
> >> the global pool ring’s implementation as index based.
> >> Once done, I will send a patch for community review. I will also
> make
> >> it as a compile time option.
> >
> > Sounds good to me.
> >
> > This could probably be abstracted to other libraries too. E.g. the
> ring library holds pointers to objects (void *); an alternative ring
> library could hold indexes to objects (uint32_t). A ring often holds
> objects from the same mempool, and the application knows which mempool,
> so indexing would be useful here too.
> >
> 
> Yes, the ring library within DPDK has APIs to support a configurable
> element size.

I remember seeing that feature proposed on the mailing list too, but I couldn't find it in the API documentation, so I was not sure it was ever accepted.

The containers section of the API documentation (/doc/api/doxy-api-index.md) doesn't contain any references to it. And the description of the RTE Ring library, which the "ring" link in the API documentation refers to, clearly says: The Ring Manager is a fixed-size queue, implemented as a table of *pointers*. (My emphasis.) So I thought it wasn't accepted.

However, searching for it in the source code reveals that it is indeed there! And the Ring Library chapter in the Programmer's Guide does mention that the objects can be something other than pointers.

So the documentation is not all screwed up, just a little sparse. :-)

> 
> >>
> >>> On Oct 31, 2021, at 3:14 AM, Morten Brørup
> <mb@smartsharesystems.com>
> >> wrote:
> >>>
> >>>> From: Morten Brørup
> >>>> Sent: Saturday, 30 October 2021 12.24
> >>>>
> >>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> >>>>> Nagarahalli
> >>>>> Sent: Monday, 4 October 2021 18.36
> >>>>>
> >>>>> <snip>
> >>>>>>
> >>>>>>
> >>>>>>>>> Current mempool per core cache implementation is based on
> >>>>> pointer
> >>>>>>>>> For most architectures, each pointer consumes 64b Replace it
> >>>>> with
> >>>>>>>>> index-based implementation, where in each buffer is addressed
> >>>>> by
> >>>>>>>>> (pool address + index)
> >>>>
> >>>> I like Dharmik's suggestion very much. CPU cache is a critical and
> >>>> limited resource.
> >>>>
> >>>> DPDK has a tendency of using pointers where indexes could be used
> >>>> instead. I suppose pointers provide the additional flexibility of
> >>>> mixing entries from different memory pools, e.g. multiple mbuf
> >> pools.
> >>>>
> >>
> >> Agreed, thank you!
> >>
> >>>>>>>>
> >>>>>>>> I don't think it is going to work:
> >>>>>>>> On 64-bit systems difference between pool address and it's
> elem
> >>>>>>>> address could be bigger than 4GB.
> >>>>>>> Are you talking about a case where the memory pool size is more
> >>>>> than 4GB?
> >>>>>>
> >>>>>> That is one possible scenario.
> >>>>
> >>>> That could be solved by making the index an element index instead
> of
> >> a
> >>>> pointer offset: address = (pool address + index * element size).
> >>>
> >>> Or instead of scaling the index with the element size, which is
> only
> >> known at runtime, the index could be more efficiently scaled by a
> >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> >> allow indexing into mempools up to 256 GB in size.
> >>>
> >>
> >> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> >> there is an ‘offset’ added to avoid objects to cross page
> boundaries.
> >> If my understanding is correct, using the index of element instead
> of a
> >> pointer offset will pose a challenge for some of the corner cases.
> >>
> >> [1]
> >>        for (i = 0; i < max_objs; i++) {
> >>                /* avoid objects to cross page boundaries */
> >>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
> >> 0) {
> >>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> >> (va + off);
> >>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> >>                                off += total_elt_sz -
> >>                                        (((uintptr_t)(va + off - 1) %
> >>                                                total_elt_sz) + 1);
> >>                }
> >>
> >
> > OK. Alternatively to scaling the index with a cache line size, you
> can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB
> mempools on respectively 64 bit and 32 bit architectures. Both x86 and
> ARM CPUs have instructions to access memory with an added offset
> multiplied by 4 or 8. So that should be high performance.
> 
> Yes, agreed this can be done.
> Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is not
> enabled.
> On a side note, I wanted to better understand the need for having the
> 'MEMPOOL_F_NO_CACHE_ALIGN' option.

The description of this field is misleading, and should be corrected.
The correct description would be: Don't need to align objs on cache lines.

It is useful for mempools containing very small objects, to conserve memory.
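E.g. (a sketch only, with arbitrary argument values, just to show where the flag goes in at
creation time):

#include <rte_lcore.h>
#include <rte_mempool.h>

/*
 * A pool of small 8 B objects. Without MEMPOOL_F_NO_CACHE_ALIGN each
 * object would be padded up for cache-line alignment, so for pools
 * like this the flag can save a significant share of the memory.
 */
static struct rte_mempool *
create_small_obj_pool(void)
{
	return rte_mempool_create("small_objs",
			8192,		/* number of elements */
			8,		/* element size in bytes */
			256,		/* per-lcore cache size */
			0,		/* private data size */
			NULL, NULL,	/* no pool constructor */
			NULL, NULL,	/* no per-object init */
			rte_socket_id(),
			MEMPOOL_F_NO_CACHE_ALIGN);
}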

> 
> >
> >>>>
> >>>>>> Another possibility - user populates mempool himself with some
> >>>>> external
> >>>>>> memory by calling rte_mempool_populate_iova() directly.
> >>>>> Is the concern that IOVA might not be contiguous for all the
> memory
> >>>>> used by the mempool?
> >>>>>
> >>>>>> I suppose such situation can even occur even with normal
> >>>>>> rte_mempool_create(), though it should be a really rare one.
> >>>>> All in all, this feature needs to be configurable during compile
> >>>> time.
> >>>
> >


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-04  8:04                 ` Morten Brørup
@ 2021-11-08  4:32                   ` Honnappa Nagarahalli
  2021-11-08  7:22                     ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-11-08  4:32 UTC (permalink / raw)
  To: Morten Brørup, Dharmik Thakkar, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang, nd

<snip>

> > >>>>>>>>> Current mempool per core cache implementation is based on
> > >>>>> pointer
> > >>>>>>>>> For most architectures, each pointer consumes 64b Replace it
> > >>>>> with
> > >>>>>>>>> index-based implementation, where in each buffer is
> > >>>>>>>>> addressed
> > >>>>> by
> > >>>>>>>>> (pool address + index)
> > >>>>
> > >>>> I like Dharmik's suggestion very much. CPU cache is a critical
> > >>>> and limited resource.
> > >>>>
> > >>>> DPDK has a tendency of using pointers where indexes could be used
> > >>>> instead. I suppose pointers provide the additional flexibility of
> > >>>> mixing entries from different memory pools, e.g. multiple mbuf
> > >> pools.
> > >>>>
> > >>
> > >> Agreed, thank you!
> > >>
> > >>>>>>>>
> > >>>>>>>> I don't think it is going to work:
> > >>>>>>>> On 64-bit systems difference between pool address and it's
> > elem
> > >>>>>>>> address could be bigger than 4GB.
> > >>>>>>> Are you talking about a case where the memory pool size is
> > >>>>>>> more
> > >>>>> than 4GB?
> > >>>>>>
> > >>>>>> That is one possible scenario.
> > >>>>
> > >>>> That could be solved by making the index an element index instead
> > of
> > >> a
> > >>>> pointer offset: address = (pool address + index * element size).
> > >>>
> > >>> Or instead of scaling the index with the element size, which is
> > only
> > >> known at runtime, the index could be more efficiently scaled by a
> > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> > >> allow indexing into mempools up to 256 GB in size.
> > >>>
> > >>
> > >> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> > >> there is an ‘offset’ added to avoid objects to cross page
> > boundaries.
> > >> If my understanding is correct, using the index of element instead
> > of a
> > >> pointer offset will pose a challenge for some of the corner cases.
> > >>
> > >> [1]
> > >>        for (i = 0; i < max_objs; i++) {
> > >>                /* avoid objects to cross page boundaries */
> > >>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz)
> > >> <
> > >> 0) {
> > >>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> > >> (va + off);
> > >>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > >>                                off += total_elt_sz -
> > >>                                        (((uintptr_t)(va + off - 1) %
> > >>                                                total_elt_sz) + 1);
> > >>                }
> > >>
> > >
> > > OK. Alternatively to scaling the index with a cache line size, you
> > can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB
> > mempools on respectively 64 bit and 32 bit architectures. Both x86 and
> > ARM CPUs have instructions to access memory with an added offset
> > multiplied by 4 or 8. So that should be high performance.
> >
> > Yes, agreed this can be done.
> > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is
> > not enabled.
> > On a side note, I wanted to better understand the need for having the
> > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> 
> The description of this field is misleading, and should be corrected.
> The correct description would be: Don't need to align objs on cache lines.
> 
> It is useful for mempools containing very small objects, to conserve memory.
I think we can assume that mbuf pools are created with the 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset calculated with cache line size as the unit.

> 
> >
> > >
> > >>>>
> > >>>>>> Another possibility - user populates mempool himself with some
> > >>>>> external
> > >>>>>> memory by calling rte_mempool_populate_iova() directly.
> > >>>>> Is the concern that IOVA might not be contiguous for all the
> > memory
> > >>>>> used by the mempool?
> > >>>>>
> > >>>>>> I suppose such situation can even occur even with normal
> > >>>>>> rte_mempool_create(), though it should be a really rare one.
> > >>>>> All in all, this feature needs to be configurable during compile
> > >>>> time.
> > >>>
> > >

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-08  4:32                   ` Honnappa Nagarahalli
@ 2021-11-08  7:22                     ` Morten Brørup
  2021-11-08 15:29                       ` Honnappa Nagarahalli
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-11-08  7:22 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Dharmik Thakkar, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang, nd

> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Monday, 8 November 2021 05.33
> 
> <snip>
> 
> > > >>>>>>>>> Current mempool per core cache implementation is based on
> > > >>>>> pointer
> > > >>>>>>>>> For most architectures, each pointer consumes 64b Replace
> it
> > > >>>>> with
> > > >>>>>>>>> index-based implementation, where in each buffer is
> > > >>>>>>>>> addressed
> > > >>>>> by
> > > >>>>>>>>> (pool address + index)
> > > >>>>
> > > >>>> I like Dharmik's suggestion very much. CPU cache is a critical
> > > >>>> and limited resource.
> > > >>>>
> > > >>>> DPDK has a tendency of using pointers where indexes could be
> used
> > > >>>> instead. I suppose pointers provide the additional flexibility
> of
> > > >>>> mixing entries from different memory pools, e.g. multiple mbuf
> > > >> pools.
> > > >>>>
> > > >>
> > > >> Agreed, thank you!
> > > >>
> > > >>>>>>>>
> > > >>>>>>>> I don't think it is going to work:
> > > >>>>>>>> On 64-bit systems difference between pool address and it's
> > > elem
> > > >>>>>>>> address could be bigger than 4GB.
> > > >>>>>>> Are you talking about a case where the memory pool size is
> > > >>>>>>> more
> > > >>>>> than 4GB?
> > > >>>>>>
> > > >>>>>> That is one possible scenario.
> > > >>>>
> > > >>>> That could be solved by making the index an element index
> instead
> > > of
> > > >> a
> > > >>>> pointer offset: address = (pool address + index * element
> size).
> > > >>>
> > > >>> Or instead of scaling the index with the element size, which is
> > > only
> > > >> known at runtime, the index could be more efficiently scaled by
> a
> > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that
> would
> > > >> allow indexing into mempools up to 256 GB in size.
> > > >>>
> > > >>
> > > >> Looking at this snippet [1] from
> rte_mempool_op_populate_helper(),
> > > >> there is an ‘offset’ added to avoid objects to cross page
> > > boundaries.
> > > >> If my understanding is correct, using the index of element
> instead
> > > of a
> > > >> pointer offset will pose a challenge for some of the corner
> cases.
> > > >>
> > > >> [1]
> > > >>        for (i = 0; i < max_objs; i++) {
> > > >>                /* avoid objects to cross page boundaries */
> > > >>                if (check_obj_bounds(va + off, pg_sz,
> total_elt_sz)
> > > >> <
> > > >> 0) {
> > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> pg_sz) -
> > > >> (va + off);
> > > >>                        if (flags &
> RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > >>                                off += total_elt_sz -
> > > >>                                        (((uintptr_t)(va + off -
> 1) %
> > > >>                                                total_elt_sz) +
> 1);
> > > >>                }
> > > >>
> > > >
> > > > OK. Alternatively to scaling the index with a cache line size,
> you
> > > can scale it with sizeof(uintptr_t) to be able to address 32 or 16
> GB
> > > mempools on respectively 64 bit and 32 bit architectures. Both x86
> and
> > > ARM CPUs have instructions to access memory with an added offset
> > > multiplied by 4 or 8. So that should be high performance.
> > >
> > > Yes, agreed this can be done.
> > > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is
> > > not enabled.
> > > On a side note, I wanted to better understand the need for having
> the
> > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> >
> > The description of this field is misleading, and should be corrected.
> > The correct description would be: Don't need to align objs on cache
> lines.
> >
> > It is useful for mempools containing very small objects, to conserve
> memory.
> I think we can assume that mbuf pools are created with the
> 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> calculated with cache line size as the unit.

You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)

I agree. And since the flag is a hint only, it can be ignored if the mempool library is scaling the index with the cache line size.

However, a mempool may contain other objects than mbufs, and those objects may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost a lot of memory for such mempools.

> 
> >
> > >
> > > >
> > > >>>>
> > > >>>>>> Another possibility - user populates mempool himself with
> some
> > > >>>>> external
> > > >>>>>> memory by calling rte_mempool_populate_iova() directly.
> > > >>>>> Is the concern that IOVA might not be contiguous for all the
> > > memory
> > > >>>>> used by the mempool?
> > > >>>>>
> > > >>>>>> I suppose such situation can even occur even with normal
> > > >>>>>> rte_mempool_create(), though it should be a really rare one.
> > > >>>>> All in all, this feature needs to be configurable during
> compile
> > > >>>> time.
> > > >>>
> > > >

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-08  7:22                     ` Morten Brørup
@ 2021-11-08 15:29                       ` Honnappa Nagarahalli
  2021-11-08 15:39                         ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-11-08 15:29 UTC (permalink / raw)
  To: Morten Brørup, Dharmik Thakkar, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang, nd

<snip>

> > > > >>>>>>>>> Current mempool per core cache implementation is based
> > > > >>>>>>>>> on
> > > > >>>>> pointer
> > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > >>>>>>>>> Replace
> > it
> > > > >>>>> with
> > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > >>>>>>>>> addressed
> > > > >>>>> by
> > > > >>>>>>>>> (pool address + index)
> > > > >>>>
> > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > >>>> critical and limited resource.
> > > > >>>>
> > > > >>>> DPDK has a tendency of using pointers where indexes could be
> > used
> > > > >>>> instead. I suppose pointers provide the additional
> > > > >>>> flexibility
> > of
> > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > >>>> mbuf
> > > > >> pools.
> > > > >>>>
> > > > >>
> > > > >> Agreed, thank you!
> > > > >>
> > > > >>>>>>>>
> > > > >>>>>>>> I don't think it is going to work:
> > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > >>>>>>>> it's
> > > > elem
> > > > >>>>>>>> address could be bigger than 4GB.
> > > > >>>>>>> Are you talking about a case where the memory pool size is
> > > > >>>>>>> more
> > > > >>>>> than 4GB?
> > > > >>>>>>
> > > > >>>>>> That is one possible scenario.
> > > > >>>>
> > > > >>>> That could be solved by making the index an element index
> > instead
> > > > of
> > > > >> a
> > > > >>>> pointer offset: address = (pool address + index * element
> > size).
> > > > >>>
> > > > >>> Or instead of scaling the index with the element size, which
> > > > >>> is
> > > > only
> > > > >> known at runtime, the index could be more efficiently scaled by
> > a
> > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that
> > would
> > > > >> allow indexing into mempools up to 256 GB in size.
> > > > >>>
> > > > >>
> > > > >> Looking at this snippet [1] from
> > rte_mempool_op_populate_helper(),
> > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > boundaries.
> > > > >> If my understanding is correct, using the index of element
> > instead
> > > > of a
> > > > >> pointer offset will pose a challenge for some of the corner
> > cases.
> > > > >>
> > > > >> [1]
> > > > >>        for (i = 0; i < max_objs; i++) {
> > > > >>                /* avoid objects to cross page boundaries */
> > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > total_elt_sz)
> > > > >> <
> > > > >> 0) {
> > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > pg_sz) -
> > > > >> (va + off);
> > > > >>                        if (flags &
> > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > >>                                off += total_elt_sz -
> > > > >>                                        (((uintptr_t)(va + off -
> > 1) %
> > > > >>                                                total_elt_sz) +
> > 1);
> > > > >>                }
> > > > >>
> > > > >
> > > > > OK. Alternatively to scaling the index with a cache line size,
> > you
> > > > can scale it with sizeof(uintptr_t) to be able to address 32 or 16
> > GB
> > > > mempools on respectively 64 bit and 32 bit architectures. Both x86
> > and
> > > > ARM CPUs have instructions to access memory with an added offset
> > > > multiplied by 4 or 8. So that should be high performance.
> > > >
> > > > Yes, agreed this can be done.
> > > > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > is not enabled.
> > > > On a side note, I wanted to better understand the need for having
> > the
> > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > >
> > > The description of this field is misleading, and should be corrected.
> > > The correct description would be: Don't need to align objs on cache
> > lines.
> > >
> > > It is useful for mempools containing very small objects, to conserve
> > memory.
> > I think we can assume that mbuf pools are created with the
> > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > calculated with cache line size as the unit.
> 
> You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
Yes 😊

> 
> I agree. And since the flag is a hint only, it can be ignored if the mempool
> library is scaling the index with the cache line size.
I do not think we should ignore the flag for reason you mention below.

> 
> However, a mempool may contain other objects than mbufs, and those objects
> may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost a
> lot of memory for such mempools.
We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set, use the unit as 'sizeof(uintptr_t)', if not set use cache line size as the unit.
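Roughly something like this (illustrative only; 'pool_base' stands for whatever base value the
patch ends up storing per pool):

#include <rte_mempool.h>

static inline void *
idx_to_obj(const struct rte_mempool *mp, void *pool_base, uint32_t idx)
{
	/* Per-pool unit selected from the creation flags. Note this makes
	 * the multiplier a runtime value rather than a compile-time
	 * constant. */
	size_t unit = (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN) ?
			sizeof(uintptr_t) : RTE_CACHE_LINE_SIZE;

	return RTE_PTR_ADD(pool_base, (uintptr_t)idx * unit);
}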

> 
> >
> > >
<snip>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-08 15:29                       ` Honnappa Nagarahalli
@ 2021-11-08 15:39                         ` Morten Brørup
  2021-11-08 15:46                           ` Honnappa Nagarahalli
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-11-08 15:39 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Dharmik Thakkar, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang, nd

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 8 November 2021 16.29
> 
> <snip>
> 
> > > > > >>>>>>>>> Current mempool per core cache implementation is
> based
> > > > > >>>>>>>>> on
> > > > > >>>>> pointer
> > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > >>>>>>>>> Replace
> > > it
> > > > > >>>>> with
> > > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > > >>>>>>>>> addressed
> > > > > >>>>> by
> > > > > >>>>>>>>> (pool address + index)
> > > > > >>>>
> > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > >>>> critical and limited resource.
> > > > > >>>>
> > > > > >>>> DPDK has a tendency of using pointers where indexes could
> be
> > > used
> > > > > >>>> instead. I suppose pointers provide the additional
> > > > > >>>> flexibility
> > > of
> > > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > > >>>> mbuf
> > > > > >> pools.
> > > > > >>>>
> > > > > >>
> > > > > >> Agreed, thank you!
> > > > > >>
> > > > > >>>>>>>>
> > > > > >>>>>>>> I don't think it is going to work:
> > > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > > >>>>>>>> it's
> > > > > elem
> > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > >>>>>>> Are you talking about a case where the memory pool size
> is
> > > > > >>>>>>> more
> > > > > >>>>> than 4GB?
> > > > > >>>>>>
> > > > > >>>>>> That is one possible scenario.
> > > > > >>>>
> > > > > >>>> That could be solved by making the index an element index
> > > instead
> > > > > of
> > > > > >> a
> > > > > >>>> pointer offset: address = (pool address + index * element
> > > size).
> > > > > >>>
> > > > > >>> Or instead of scaling the index with the element size,
> which
> > > > > >>> is
> > > > > only
> > > > > >> known at runtime, the index could be more efficiently scaled
> by
> > > a
> > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> that
> > > would
> > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > >>>
> > > > > >>
> > > > > >> Looking at this snippet [1] from
> > > rte_mempool_op_populate_helper(),
> > > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > > boundaries.
> > > > > >> If my understanding is correct, using the index of element
> > > instead
> > > > > of a
> > > > > >> pointer offset will pose a challenge for some of the corner
> > > cases.
> > > > > >>
> > > > > >> [1]
> > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > >>                /* avoid objects to cross page boundaries */
> > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > total_elt_sz)
> > > > > >> <
> > > > > >> 0) {
> > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > > pg_sz) -
> > > > > >> (va + off);
> > > > > >>                        if (flags &
> > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > >>                                off += total_elt_sz -
> > > > > >>                                        (((uintptr_t)(va +
> off -
> > > 1) %
> > > > > >>                                                total_elt_sz)
> +
> > > 1);
> > > > > >>                }
> > > > > >>
> > > > > >
> > > > > > OK. Alternatively to scaling the index with a cache line
> size,
> > > you
> > > > > can scale it with sizeof(uintptr_t) to be able to address 32 or
> 16
> > > GB
> > > > > mempools on respectively 64 bit and 32 bit architectures. Both
> x86
> > > and
> > > > > ARM CPUs have instructions to access memory with an added
> offset
> > > > > multiplied by 4 or 8. So that should be high performance.
> > > > >
> > > > > Yes, agreed this can be done.
> > > > > Cache line size can also be used when
> ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > is not enabled.
> > > > > On a side note, I wanted to better understand the need for
> having
> > > the
> > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > >
> > > > The description of this field is misleading, and should be
> corrected.
> > > > The correct description would be: Don't need to align objs on
> cache
> > > lines.
> > > >
> > > > It is useful for mempools containing very small objects, to
> conserve
> > > memory.
> > > I think we can assume that mbuf pools are created with the
> > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > > calculated with cache line size as the unit.
> >
> > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> Yes 😊
> 
> >
> > I agree. And since the flag is a hint only, it can be ignored if the
> mempool
> > library is scaling the index with the cache line size.
> I do not think we should ignore the flag for reason you mention below.
> 
> >
> > However, a mempool may contain other objects than mbufs, and those
> objects
> > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost
> a
> > lot of memory for such mempools.
> We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set, use
> the unit as 'sizeof(uintptr_t)', if not set use cache line size as the
> unit.
> 

That would require that the indexing multiplier is a runtime parameter instead of a compile time parameter. So it would have a performance penalty.

The indexing multiplier could be compile time configurable, so it is a tradeoff between granularity and maximum mempool size.


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-08 15:39                         ` Morten Brørup
@ 2021-11-08 15:46                           ` Honnappa Nagarahalli
  2021-11-08 16:03                             ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Honnappa Nagarahalli @ 2021-11-08 15:46 UTC (permalink / raw)
  To: Morten Brørup, Dharmik Thakkar, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang, nd, nd

<snip>
> >
> > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > based
> > > > > > >>>>>>>>> on
> > > > > > >>>>> pointer
> > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > >>>>>>>>> Replace
> > > > it
> > > > > > >>>>> with
> > > > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > > > >>>>>>>>> addressed
> > > > > > >>>>> by
> > > > > > >>>>>>>>> (pool address + index)
> > > > > > >>>>
> > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > >>>> critical and limited resource.
> > > > > > >>>>
> > > > > > >>>> DPDK has a tendency of using pointers where indexes could
> > be
> > > > used
> > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > >>>> flexibility
> > > > of
> > > > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > > > >>>> mbuf
> > > > > > >> pools.
> > > > > > >>>>
> > > > > > >>
> > > > > > >> Agreed, thank you!
> > > > > > >>
> > > > > > >>>>>>>>
> > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > > > >>>>>>>> it's
> > > > > > elem
> > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > >>>>>>> size
> > is
> > > > > > >>>>>>> more
> > > > > > >>>>> than 4GB?
> > > > > > >>>>>>
> > > > > > >>>>>> That is one possible scenario.
> > > > > > >>>>
> > > > > > >>>> That could be solved by making the index an element index
> > > > instead
> > > > > > of
> > > > > > >> a
> > > > > > >>>> pointer offset: address = (pool address + index * element
> > > > size).
> > > > > > >>>
> > > > > > >>> Or instead of scaling the index with the element size,
> > which
> > > > > > >>> is
> > > > > > only
> > > > > > >> known at runtime, the index could be more efficiently
> > > > > > >> scaled
> > by
> > > > a
> > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > that
> > > > would
> > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > >>>
> > > > > > >>
> > > > > > >> Looking at this snippet [1] from
> > > > rte_mempool_op_populate_helper(),
> > > > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > > > boundaries.
> > > > > > >> If my understanding is correct, using the index of element
> > > > instead
> > > > > > of a
> > > > > > >> pointer offset will pose a challenge for some of the corner
> > > > cases.
> > > > > > >>
> > > > > > >> [1]
> > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > >>                /* avoid objects to cross page boundaries */
> > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > total_elt_sz)
> > > > > > >> <
> > > > > > >> 0) {
> > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > > > pg_sz) -
> > > > > > >> (va + off);
> > > > > > >>                        if (flags &
> > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > >>                                off += total_elt_sz -
> > > > > > >>                                        (((uintptr_t)(va +
> > off -
> > > > 1) %
> > > > > > >>
> > > > > > >> total_elt_sz)
> > +
> > > > 1);
> > > > > > >>                }
> > > > > > >>
> > > > > > >
> > > > > > > OK. Alternatively to scaling the index with a cache line
> > size,
> > > > you
> > > > > > can scale it with sizeof(uintptr_t) to be able to address 32
> > > > > > or
> > 16
> > > > GB
> > > > > > mempools on respectively 64 bit and 32 bit architectures. Both
> > x86
> > > > and
> > > > > > ARM CPUs have instructions to access memory with an added
> > offset
> > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > >
> > > > > > Yes, agreed this can be done.
> > > > > > Cache line size can also be used when
> > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > is not enabled.
> > > > > > On a side note, I wanted to better understand the need for
> > having
> > > > the
> > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > >
> > > > > The description of this field is misleading, and should be
> > corrected.
> > > > > The correct description would be: Don't need to align objs on
> > cache
> > > > lines.
> > > > >
> > > > > It is useful for mempools containing very small objects, to
> > conserve
> > > > memory.
> > > > I think we can assume that mbuf pools are created with the
> > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > > > calculated with cache line size as the unit.
> > >
> > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > Yes 😊
> >
> > >
> > > I agree. And since the flag is a hint only, it can be ignored if the
> > mempool
> > > library is scaling the index with the cache line size.
> > I do not think we should ignore the flag for reason you mention below.
> >
> > >
> > > However, a mempool may contain other objects than mbufs, and those
> > objects
> > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> cost
> > a
> > > lot of memory for such mempools.
> > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > use the unit as 'sizeof(uintptr_t)', if not set use cache line size as
> > the unit.
> >
> 
> That would require that the indexing multiplier is a runtime parameter instead
> of a compile time parameter. So it would have a performance penalty.
> 
> The indexing multiplier could be compile time configurable, so it is a tradeoff
> between granularity and maximum mempool size.
I meant compile time configurable. i.e.

#ifdef MEMPOOL_F_NO_CACHE_ALIGN
<use sizeof(uintptr_t) as the multiplier>
#else
<use cache line size as the multiplier> /* This should provide enough memory for packet buffers */
#endif

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-08 15:46                           ` Honnappa Nagarahalli
@ 2021-11-08 16:03                             ` Morten Brørup
  2021-11-08 16:47                               ` Jerin Jacob
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-11-08 16:03 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Dharmik Thakkar, Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Ruifeng Wang, nd, nd

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 8 November 2021 16.46
> 
> <snip>
> > >
> > > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > > based
> > > > > > > >>>>>>>>> on
> > > > > > > >>>>> pointer
> > > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > > >>>>>>>>> Replace
> > > > > it
> > > > > > > >>>>> with
> > > > > > > >>>>>>>>> index-based implementation, where in each buffer
> is
> > > > > > > >>>>>>>>> addressed
> > > > > > > >>>>> by
> > > > > > > >>>>>>>>> (pool address + index)
> > > > > > > >>>>
> > > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > > >>>> critical and limited resource.
> > > > > > > >>>>
> > > > > > > >>>> DPDK has a tendency of using pointers where indexes
> could
> > > be
> > > > > used
> > > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > > >>>> flexibility
> > > > > of
> > > > > > > >>>> mixing entries from different memory pools, e.g.
> multiple
> > > > > > > >>>> mbuf
> > > > > > > >> pools.
> > > > > > > >>>>
> > > > > > > >>
> > > > > > > >> Agreed, thank you!
> > > > > > > >>
> > > > > > > >>>>>>>>
> > > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > > >>>>>>>> On 64-bit systems difference between pool address
> and
> > > > > > > >>>>>>>> it's
> > > > > > > elem
> > > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > > >>>>>>> size
> > > is
> > > > > > > >>>>>>> more
> > > > > > > >>>>> than 4GB?
> > > > > > > >>>>>>
> > > > > > > >>>>>> That is one possible scenario.
> > > > > > > >>>>
> > > > > > > >>>> That could be solved by making the index an element
> index
> > > > > instead
> > > > > > > of
> > > > > > > >> a
> > > > > > > >>>> pointer offset: address = (pool address + index *
> element
> > > > > size).
> > > > > > > >>>
> > > > > > > >>> Or instead of scaling the index with the element size,
> > > which
> > > > > > > >>> is
> > > > > > > only
> > > > > > > >> known at runtime, the index could be more efficiently
> > > > > > > >> scaled
> > > by
> > > > > a
> > > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > > that
> > > > > would
> > > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > > >>>
> > > > > > > >>
> > > > > > > >> Looking at this snippet [1] from
> > > > > rte_mempool_op_populate_helper(),
> > > > > > > >> there is an ‘offset’ added to avoid objects to cross
> page
> > > > > > > boundaries.
> > > > > > > >> If my understanding is correct, using the index of
> element
> > > > > instead
> > > > > > > of a
> > > > > > > >> pointer offset will pose a challenge for some of the
> corner
> > > > > cases.
> > > > > > > >>
> > > > > > > >> [1]
> > > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > > >>                /* avoid objects to cross page boundaries
> */
> > > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > > total_elt_sz)
> > > > > > > >> <
> > > > > > > >> 0) {
> > > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va +
> off,
> > > > > pg_sz) -
> > > > > > > >> (va + off);
> > > > > > > >>                        if (flags &
> > > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > > >>                                off += total_elt_sz -
> > > > > > > >>                                        (((uintptr_t)(va
> +
> > > off -
> > > > > 1) %
> > > > > > > >>
> > > > > > > >> total_elt_sz)
> > > +
> > > > > 1);
> > > > > > > >>                }
> > > > > > > >>
> > > > > > > >
> > > > > > > > OK. Alternatively to scaling the index with a cache line
> > > size,
> > > > > you
> > > > > > > can scale it with sizeof(uintptr_t) to be able to address
> 32
> > > > > > > or
> > > 16
> > > > > GB
> > > > > > > mempools on respectively 64 bit and 32 bit architectures.
> Both
> > > x86
> > > > > and
> > > > > > > ARM CPUs have instructions to access memory with an added
> > > offset
> > > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > > >
> > > > > > > Yes, agreed this can be done.
> > > > > > > Cache line size can also be used when
> > > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > > is not enabled.
> > > > > > > On a side note, I wanted to better understand the need for
> > > having
> > > > > the
> > > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > > >
> > > > > > The description of this field is misleading, and should be
> > > corrected.
> > > > > > The correct description would be: Don't need to align objs on
> > > cache
> > > > > lines.
> > > > > >
> > > > > > It is useful for mempools containing very small objects, to
> > > conserve
> > > > > memory.
> > > > > I think we can assume that mbuf pools are created with the
> > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use
> offset
> > > > > calculated with cache line size as the unit.
> > > >
> > > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > > Yes 😊
> > >
> > > >
> > > > I agree. And since the flag is a hint only, it can be ignored if
> the
> > > mempool
> > > > library is scaling the index with the cache line size.
> > > I do not think we should ignore the flag for reason you mention
> below.
> > >
> > > >
> > > > However, a mempool may contain other objects than mbufs, and
> those
> > > objects
> > > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> > cost
> > > a
> > > > lot of memory for such mempools.
> > > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > > use the unit as 'sizeof(uintptr_t)', if not set use cache line size
> as
> > > the unit.
> > >
> >
> > That would require that the indexing multiplier is a runtime
> parameter instead
> > of a compile time parameter. So it would have a performance penalty.
> >
> > The indexing multiplier could be compile time configurable, so it is
> a tradeoff
> > between granularity and maximum mempool size.
> I meant compile time configurable. i.e.
> 
> #ifdef MEMPOOL_F_NO_CACHE_ALIGN
> <use sizeof(uintptr_t) as the multiplier>
> #else
> <use cache line size as the multiplier> /* This should provide enough
> memory for packet buffers */
> #endif

Please note that MEMPOOL_F_NO_CACHE_ALIGN is a runtime flag passed when creating a mempool, not a compile time option.



^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [dpdk-dev] [RFC] mempool: implement index-based per core cache
  2021-11-08 16:03                             ` Morten Brørup
@ 2021-11-08 16:47                               ` Jerin Jacob
  0 siblings, 0 replies; 52+ messages in thread
From: Jerin Jacob @ 2021-11-08 16:47 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Honnappa Nagarahalli, Dharmik Thakkar, Ananyev, Konstantin,
	Olivier Matz, Andrew Rybchenko, dpdk-dev, nd, Ruifeng Wang

On Mon, Nov 8, 2021 at 9:34 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> > Nagarahalli
> > Sent: Monday, 8 November 2021 16.46
> >
> > <snip>
> > > >
> > > > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > > > based
> > > > > > > > >>>>>>>>> on
> > > > > > > > >>>>> pointer
> > > > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > > > >>>>>>>>> Replace
> > > > > > it
> > > > > > > > >>>>> with
> > > > > > > > >>>>>>>>> index-based implementation, where in each buffer
> > is
> > > > > > > > >>>>>>>>> addressed
> > > > > > > > >>>>> by
> > > > > > > > >>>>>>>>> (pool address + index)
> > > > > > > > >>>>
> > > > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > > > >>>> critical and limited resource.
> > > > > > > > >>>>
> > > > > > > > >>>> DPDK has a tendency of using pointers where indexes
> > could
> > > > be
> > > > > > used
> > > > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > > > >>>> flexibility
> > > > > > of
> > > > > > > > >>>> mixing entries from different memory pools, e.g.
> > multiple
> > > > > > > > >>>> mbuf
> > > > > > > > >> pools.
> > > > > > > > >>>>
> > > > > > > > >>
> > > > > > > > >> Agreed, thank you!
> > > > > > > > >>
> > > > > > > > >>>>>>>>
> > > > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > > > >>>>>>>> On 64-bit systems difference between pool address
> > and
> > > > > > > > >>>>>>>> it's
> > > > > > > > elem
> > > > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > > > >>>>>>> size
> > > > is
> > > > > > > > >>>>>>> more
> > > > > > > > >>>>> than 4GB?
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> That is one possible scenario.
> > > > > > > > >>>>
> > > > > > > > >>>> That could be solved by making the index an element
> > index
> > > > > > instead
> > > > > > > > of
> > > > > > > > >> a
> > > > > > > > >>>> pointer offset: address = (pool address + index *
> > element
> > > > > > size).
> > > > > > > > >>>
> > > > > > > > >>> Or instead of scaling the index with the element size,
> > > > which
> > > > > > > > >>> is
> > > > > > > > only
> > > > > > > > >> known at runtime, the index could be more efficiently
> > > > > > > > >> scaled
> > > > by
> > > > > > a
> > > > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > > > that
> > > > > > would
> > > > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > > > >>>
> > > > > > > > >>
> > > > > > > > >> Looking at this snippet [1] from
> > > > > > rte_mempool_op_populate_helper(),
> > > > > > > > >> there is an ‘offset’ added to avoid objects to cross
> > page
> > > > > > > > boundaries.
> > > > > > > > >> If my understanding is correct, using the index of
> > element
> > > > > > instead
> > > > > > > > of a
> > > > > > > > >> pointer offset will pose a challenge for some of the
> > corner
> > > > > > cases.
> > > > > > > > >>
> > > > > > > > >> [1]
> > > > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > > > >>                /* avoid objects to cross page boundaries
> > */
> > > > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > > > total_elt_sz)
> > > > > > > > >> <
> > > > > > > > >> 0) {
> > > > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va +
> > off,
> > > > > > pg_sz) -
> > > > > > > > >> (va + off);
> > > > > > > > >>                        if (flags &
> > > > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > > > >>                                off += total_elt_sz -
> > > > > > > > >>                                        (((uintptr_t)(va
> > +
> > > > off -
> > > > > > 1) %
> > > > > > > > >>
> > > > > > > > >> total_elt_sz)
> > > > +
> > > > > > 1);
> > > > > > > > >>                }
> > > > > > > > >>
> > > > > > > > >
> > > > > > > > > OK. Alternatively to scaling the index with a cache line
> > > > size,
> > > > > > you
> > > > > > > > can scale it with sizeof(uintptr_t) to be able to address
> > 32
> > > > > > > > or
> > > > 16
> > > > > > GB
> > > > > > > > mempools on respectively 64 bit and 32 bit architectures.
> > Both
> > > > x86
> > > > > > and
> > > > > > > > ARM CPUs have instructions to access memory with an added
> > > > offset
> > > > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > > > >
> > > > > > > > Yes, agreed this can be done.
> > > > > > > > Cache line size can also be used when
> > > > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > > > is not enabled.
> > > > > > > > On a side note, I wanted to better understand the need for
> > > > having
> > > > > > the
> > > > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > > > >
> > > > > > > The description of this field is misleading, and should be
> > > > corrected.
> > > > > > > The correct description would be: Don't need to align objs on
> > > > cache
> > > > > > lines.
> > > > > > >
> > > > > > > It is useful for mempools containing very small objects, to
> > > > conserve
> > > > > > memory.
> > > > > > I think we can assume that mbuf pools are created with the
> > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use
> > offset
> > > > > > calculated with cache line size as the unit.
> > > > >
> > > > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > > > Yes 😊
> > > >
> > > > >
> > > > > I agree. And since the flag is a hint only, it can be ignored if
> > the
> > > > mempool
> > > > > library is scaling the index with the cache line size.
> > > > I do not think we should ignore the flag for reason you mention
> > below.
> > > >
> > > > >
> > > > > However, a mempool may contain other objects than mbufs, and
> > those
> > > > objects
> > > > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> > > cost
> > > > a
> > > > > lot of memory for such mempools.
> > > > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > > > use the unit as 'sizeof(uintptr_t)', if not set use cache line size
> > as
> > > > the unit.
> > > >
> > >
> > > That would require that the indexing multiplier is a runtime
> > parameter instead
> > > of a compile time parameter. So it would have a performance penalty.
> > >
> > > The indexing multiplier could be compile time configurable, so it is
> > a tradeoff
> > > between granularity and maximum mempool size.
> > I meant compile time configurable. i.e.
> >
> > #ifdef MEMPOOL_F_NO_CACHE_ALIGN
> > <use sizeof(uintptr_t) as the multiplier>
> > #else
> > <use cache line size as the multiplier> /* This should provide enough
> > memory for packet buffers */
> > #endif
>
> Please note that MEMPOOL_F_NO_CACHE_ALIGN is a runtime flag passed when creating a mempool, not a compile time option.

Also, please share PMU counter stats on L1 and L2 misses with and
without this scheme after the rework. IMO, we should not have any
regression on
1) per-core mpps, or
2) L1 and L2 misses,
with l3fwd/testpmd/l2fwd etc.


>
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 0/1] mempool: implement index-based per core cache
  2021-09-30 17:27 [dpdk-dev] [RFC] mempool: implement index-based per core cache Dharmik Thakkar
  2021-10-01 12:36 ` Jerin Jacob
  2021-10-01 21:30 ` Ananyev, Konstantin
@ 2021-12-24 22:59 ` Dharmik Thakkar
  2021-12-24 22:59   ` [PATCH 1/1] " Dharmik Thakkar
                     ` (2 more replies)
  2 siblings, 3 replies; 52+ messages in thread
From: Dharmik Thakkar @ 2021-12-24 22:59 UTC (permalink / raw)
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang, Dharmik Thakkar

The current mempool per-core cache implementation stores pointers to mbufs.
On 64b architectures, each pointer consumes 8B.
This patch replaces it with an index-based implementation,
wherein each buffer is addressed by (pool base address + index).
It reduces the amount of memory/cache required for the per-core cache.

L3Fwd performance testing reveals minor improvements in cache
performance (L1 and L2 misses reduced by 0.60%)
with no change in throughput.

Micro-benchmarking the patch using mempool_perf_test shows
significant improvement with the majority of the test cases:

Number of cores = 1:
n_get_bulk=1 n_put_bulk=1 n_keep=32 %_change_with_patch=18.01
n_get_bulk=1 n_put_bulk=1 n_keep=128 %_change_with_patch=19.91
n_get_bulk=1 n_put_bulk=4 n_keep=32 %_change_with_patch=-20.37 (regression)
n_get_bulk=1 n_put_bulk=4 n_keep=128 %_change_with_patch=-17.01 (regression) 
n_get_bulk=1 n_put_bulk=32 n_keep=32 %_change_with_patch=-25.06 (regression)
n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.81 (regression)
n_get_bulk=4 n_put_bulk=1 n_keep=32 %_change_with_patch=53.93
n_get_bulk=4 n_put_bulk=1 n_keep=128 %_change_with_patch=60.90
n_get_bulk=4 n_put_bulk=4 n_keep=32 %_change_with_patch=1.64
n_get_bulk=4 n_put_bulk=4 n_keep=128 %_change_with_patch=8.76
n_get_bulk=4 n_put_bulk=32 n_keep=32 %_change_with_patch=-4.71 (regression)
n_get_bulk=4 n_put_bulk=32 n_keep=128 %_change_with_patch=-3.19 (regression)
n_get_bulk=32 n_put_bulk=1 n_keep=32 %_change_with_patch=65.63
n_get_bulk=32 n_put_bulk=1 n_keep=128 %_change_with_patch=75.19
n_get_bulk=32 n_put_bulk=4 n_keep=32 %_change_with_patch=11.75
n_get_bulk=32 n_put_bulk=4 n_keep=128 %_change_with_patch=15.52
n_get_bulk=32 n_put_bulk=32 n_keep=32 %_change_with_patch=13.45
n_get_bulk=32 n_put_bulk=32 n_keep=128 %_change_with_patch=11.58

Number of core = 2:
n_get_bulk=1 n_put_bulk=1 n_keep=32 %_change_with_patch=18.21
n_get_bulk=1 n_put_bulk=1 n_keep=128 %_change_with_patch=21.89
n_get_bulk=1 n_put_bulk=4 n_keep=32 %_change_with_patch=-21.21 (regression)
n_get_bulk=1 n_put_bulk=4 n_keep=128 %_change_with_patch=-17.05 (regression)
n_get_bulk=1 n_put_bulk=32 n_keep=32 %_change_with_patch=-26.09 (regression)
n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.49 (regression)
n_get_bulk=4 n_put_bulk=1 n_keep=32 %_change_with_patch=56.28
n_get_bulk=4 n_put_bulk=1 n_keep=128 %_change_with_patch=67.69
n_get_bulk=4 n_put_bulk=4 n_keep=32 %_change_with_patch=1.45
n_get_bulk=4 n_put_bulk=4 n_keep=128 %_change_with_patch=8.84
n_get_bulk=4 n_put_bulk=32 n_keep=32 %_change_with_patch=-5.27 (regression)
n_get_bulk=4 n_put_bulk=32 n_keep=128 %_change_with_patch=-3.09 (regression)
n_get_bulk=32 n_put_bulk=1 n_keep=32 %_change_with_patch=76.11
n_get_bulk=32 n_put_bulk=1 n_keep=128 %_change_with_patch=86.06
n_get_bulk=32 n_put_bulk=4 n_keep=32 %_change_with_patch=11.86
n_get_bulk=32 n_put_bulk=4 n_keep=128 %_change_with_patch=16.55
n_get_bulk=32 n_put_bulk=32 n_keep=32 %_change_with_patch=13.01
n_get_bulk=32 n_put_bulk=32 n_keep=128 %_change_with_patch=11.51


From analyzing the results, it is clear that for n_get_bulk and
n_put_bulk sizes of 32 there is no performance regression.
IMO, the other sizes are not practical from a performance perspective,
and the regression in those cases can be safely ignored.
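For readers not familiar with mempool_perf_test, each case above roughly exercises the
following pattern (a simplified sketch, not the actual code in app/test/test_mempool_perf.c;
it assumes the pool always holds at least n_keep free objects per lcore, so the gets cannot
fail):

#include <rte_mempool.h>

static void
perf_pattern(struct rte_mempool *mp, unsigned int n_get_bulk,
	     unsigned int n_put_bulk, unsigned int n_keep)
{
	void *objs[128];	/* >= n_keep for the cases listed above */
	unsigned int i, iter;

	for (iter = 0; iter < 1000000; iter++) {
		/* Take n_keep objects out of the pool, n_get_bulk at a time. */
		for (i = 0; i < n_keep; i += n_get_bulk)
			(void)rte_mempool_get_bulk(mp, &objs[i], n_get_bulk);
		/* Return them to the pool, n_put_bulk at a time. */
		for (i = 0; i < n_keep; i += n_put_bulk)
			rte_mempool_put_bulk(mp, &objs[i], n_put_bulk);
	}
}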

Dharmik Thakkar (1):
  mempool: implement index-based per core cache

 lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
 lib/mempool/rte_mempool_ops_default.c |   7 ++
 2 files changed, 119 insertions(+), 2 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH 1/1] mempool: implement index-based per core cache
  2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
@ 2021-12-24 22:59   ` Dharmik Thakkar
  2022-01-11  2:26     ` Ananyev, Konstantin
  2021-12-25  0:16   ` [PATCH 0/1] " Morten Brørup
  2022-01-13  5:36   ` [PATCH v2 " Dharmik Thakkar
  2 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2021-12-24 22:59 UTC (permalink / raw)
  To: Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang, Dharmik Thakkar

The current mempool per-core cache implementation stores pointers to mbufs.
On 64b architectures, each pointer consumes 8B.
This patch replaces it with an index-based implementation,
wherein each buffer is addressed by (pool base address + index).
It reduces the amount of memory/cache required for the per-core cache.

L3Fwd performance testing reveals minor improvements in cache
performance (L1 and L2 misses reduced by 0.60%)
with no change in throughput.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
 lib/mempool/rte_mempool_ops_default.c |   7 ++
 2 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1e7a3c15273c..4fabd3b1920b 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -50,6 +50,10 @@
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#include <rte_vect.h>
+#endif
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -239,6 +243,9 @@ struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *pool_base_value; /**< Base value to calculate indices */
+#endif
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = mp->pool_base_value;
+	uint32_t *cache_objs = (uint32_t *) cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
+#else
 	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+#endif
 	cache->len = 0;
 }
 
@@ -1334,8 +1353,13 @@ static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
+#else
 	void **cache_objs;
-
+#endif
 	/* increment stat now, adding in mempool always success */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
@@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
 		goto ring_enqueue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	cache_objs = (uint32_t *) cache->objs;
+	cache_objs = &cache_objs[cache->len];
+	base_value = mp->pool_base_value;
+#else
 	cache_objs = &cache->objs[cache->len];
+#endif
 
 	/*
 	 * The cache follows the following algorithm
@@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
+	for (i = 0; i < (n & ~0x1); i += 2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
+#else
 	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
+				cache->len - cache->size);
+#else
 		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
 				cache->len - cache->size);
+#endif
 		cache->len = cache->size;
 	}
 
@@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 {
 	int ret;
 	uint32_t index, len;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t i;
+	uint32_t *cache_objs;
+#else
 	void **cache_objs;
-
+#endif
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *base_value = mp->pool_base_value;
+	cache_objs = (uint32_t *) cache->objs;
+#else
 	cache_objs = cache->objs;
+#endif
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
@@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		uint32_t req = n + (cache->size - cache->len);
 
 		/* How many do we require i.e. number to fill the cache + the request */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+		ret = rte_mempool_ops_dequeue_bulk(mp,
+			temp_objs, req);
+#else
 		ret = rte_mempool_ops_dequeue_bulk(mp,
 			&cache->objs[cache->len], req);
+#endif
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
+								base_value);
+		}
+#endif
 		cache->len += req;
 	}
 
 	/* Now fill in the response ... */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
+						len -= 4, obj_table += 4) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
+	}
+	switch (n & 0x3) {
+	case 3:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+								/* fallthrough */
+	case 2:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+								/* fallthrough */
+	case 1:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+	}
+#else
+	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
 		*obj_table = cache_objs[len];
+#endif
 
 	cache->len -= n;
 
diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
index 22fccf9d7619..3543cad9d4ce 100644
--- a/lib/mempool/rte_mempool_ops_default.c
+++ b/lib/mempool/rte_mempool_ops_default.c
@@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
 		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		/* Store pool base value to calculate indices for index-based
+		 * lcore cache implementation
+		 */
+		if (i == 0)
+			mp->pool_base_value = obj;
+#endif
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
 		off += mp->elt_size + mp->trailer_size;
 	}
-- 
2.25.1


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 0/1] mempool: implement index-based per core cache
  2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
  2021-12-24 22:59   ` [PATCH 1/1] " Dharmik Thakkar
@ 2021-12-25  0:16   ` Morten Brørup
  2022-01-07 11:15     ` Bruce Richardson
  2022-01-13  5:36   ` [PATCH v2 " Dharmik Thakkar
  2 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2021-12-25  0:16 UTC (permalink / raw)
  To: Dharmik Thakkar; +Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang

> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
> Sent: Friday, 24 December 2021 23.59
> 
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput
> 
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
> 
> Number of cores = 1:
> n_get_bulk=1 n_put_bulk=1 n_keep=32 %_change_with_patch=18.01
> n_get_bulk=1 n_put_bulk=1 n_keep=128 %_change_with_patch=19.91
> n_get_bulk=1 n_put_bulk=4 n_keep=32 %_change_with_patch=-20.37
> (regression)
> n_get_bulk=1 n_put_bulk=4 n_keep=128 %_change_with_patch=-17.01
> (regression)
> n_get_bulk=1 n_put_bulk=32 n_keep=32 %_change_with_patch=-25.06
> (regression)
> n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.81
> (regression)
> n_get_bulk=4 n_put_bulk=1 n_keep=32 %_change_with_patch=53.93
> n_get_bulk=4 n_put_bulk=1 n_keep=128 %_change_with_patch=60.90
> n_get_bulk=4 n_put_bulk=4 n_keep=32 %_change_with_patch=1.64
> n_get_bulk=4 n_put_bulk=4 n_keep=128 %_change_with_patch=8.76
> n_get_bulk=4 n_put_bulk=32 n_keep=32 %_change_with_patch=-4.71
> (regression)
> n_get_bulk=4 n_put_bulk=32 n_keep=128 %_change_with_patch=-3.19
> (regression)
> n_get_bulk=32 n_put_bulk=1 n_keep=32 %_change_with_patch=65.63
> n_get_bulk=32 n_put_bulk=1 n_keep=128 %_change_with_patch=75.19
> n_get_bulk=32 n_put_bulk=4 n_keep=32 %_change_with_patch=11.75
> n_get_bulk=32 n_put_bulk=4 n_keep=128 %_change_with_patch=15.52
> n_get_bulk=32 n_put_bulk=32 n_keep=32 %_change_with_patch=13.45
> n_get_bulk=32 n_put_bulk=32 n_keep=128 %_change_with_patch=11.58
> 
> Number of core = 2:
> n_get_bulk=1 n_put_bulk=1 n_keep=32 %_change_with_patch=18.21
> n_get_bulk=1 n_put_bulk=1 n_keep=128 %_change_with_patch=21.89
> n_get_bulk=1 n_put_bulk=4 n_keep=32 %_change_with_patch=-21.21
> (regression)
> n_get_bulk=1 n_put_bulk=4 n_keep=128 %_change_with_patch=-17.05
> (regression)
> n_get_bulk=1 n_put_bulk=32 n_keep=32 %_change_with_patch=-26.09
> (regression)
> n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.49
> (regression)
> n_get_bulk=4 n_put_bulk=1 n_keep=32 %_change_with_patch=56.28
> n_get_bulk=4 n_put_bulk=1 n_keep=128 %_change_with_patch=67.69
> n_get_bulk=4 n_put_bulk=4 n_keep=32 %_change_with_patch=1.45
> n_get_bulk=4 n_put_bulk=4 n_keep=128 %_change_with_patch=8.84
> n_get_bulk=4 n_put_bulk=32 n_keep=32 %_change_with_patch=-5.27
> (regression)
> n_get_bulk=4 n_put_bulk=32 n_keep=128 %_change_with_patch=-3.09
> (regression)
> n_get_bulk=32 n_put_bulk=1 n_keep=32 %_change_with_patch=76.11
> n_get_bulk=32 n_put_bulk=1 n_keep=128 %_change_with_patch=86.06
> n_get_bulk=32 n_put_bulk=4 n_keep=32 %_change_with_patch=11.86
> n_get_bulk=32 n_put_bulk=4 n_keep=128 %_change_with_patch=16.55
> n_get_bulk=32 n_put_bulk=32 n_keep=32 %_change_with_patch=13.01
> n_get_bulk=32 n_put_bulk=32 n_keep=128 %_change_with_patch=11.51
> 
> 
> From analyzing the results, it is clear that for n_get_bulk and
> n_put_bulk sizes of 32 there is no performance regression
> IMO, the other sizes are not practical from performance perspective
> and the regression in those cases can be safely ignored
> 
> Dharmik Thakkar (1):
>   mempool: implement index-based per core cache
> 
>  lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
>  lib/mempool/rte_mempool_ops_default.c |   7 ++
>  2 files changed, 119 insertions(+), 2 deletions(-)
> 
> --
> 2.25.1
> 

I still think this is very interesting. And your performance numbers are looking good.

However, it limits the size of a mempool to 4 GB. As previously discussed, the max mempool size can be increased by multiplying the index by a constant.

I would suggest using sizeof(uintptr_t) as the constant multiplier, so the mempool can hold objects of any size divisible by sizeof(uintptr_t). And it would be silly to use a mempool to hold objects smaller than sizeof(uintptr_t).

How does the performance look if you multiply the index by sizeof(uintptr_t)?
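
For clarity, here is a minimal standalone sketch of what I mean (illustrative
only; the names and the tiny test harness are made up, this is not the patch
code):

/*
 * Sketch: index <-> pointer conversion with a sizeof(uintptr_t) multiplier.
 * Assumes every object starts at an offset from the pool base that is
 * divisible by sizeof(uintptr_t).
 */
#include <assert.h>
#include <stdint.h>

#define OBJ_MULTIPLIER sizeof(uintptr_t) /* 8 B on 64 bit -> max pool 32 GB */

static inline uint32_t
obj_to_index(const void *base, const void *obj)
{
	uintptr_t off = (uintptr_t)obj - (uintptr_t)base;

	return (uint32_t)(off / OBJ_MULTIPLIER); /* compiles to a shift */
}

static inline void *
index_to_obj(void *base, uint32_t idx)
{
	return (char *)base + (uintptr_t)idx * OBJ_MULTIPLIER;
}

int main(void)
{
	char pool[1024];
	void *obj = &pool[64];

	assert(index_to_obj(pool, obj_to_index(pool, obj)) == obj);
	return 0;
}

The interesting part performance-wise is whether the extra shift in
index_to_obj() on the get path costs anything measurable.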


Med venlig hilsen / Kind regards,
-Morten Brørup




^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 0/1] mempool: implement index-based per core cache
  2021-12-25  0:16   ` [PATCH 0/1] " Morten Brørup
@ 2022-01-07 11:15     ` Bruce Richardson
  2022-01-07 11:29       ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Bruce Richardson @ 2022-01-07 11:15 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Dharmik Thakkar, dev, nd, honnappa.nagarahalli, ruifeng.wang

On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent: Friday, 24
> > December 2021 23.59
> > 
> > Current mempool per core cache implementation stores pointers to mbufs
> > On 64b architectures, each pointer consumes 8B This patch replaces it
> > with index-based implementation, where in each buffer is addressed by
> > (pool base address + index) It reduces the amount of memory/cache
> > required for per core cache
> > 
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance (L1 and L2 misses reduced by 0.60%) with no change in
> > throughput
> > 
> > Micro-benchmarking the patch using mempool_perf_test shows significant
> > improvement with majority of the test cases
> > 
> > Number of cores = 1: n_get_bulk=1 n_put_bulk=1 n_keep=32
> > %_change_with_patch=18.01 n_get_bulk=1 n_put_bulk=1 n_keep=128
> > %_change_with_patch=19.91 n_get_bulk=1 n_put_bulk=4 n_keep=32
> > %_change_with_patch=-20.37 (regression) n_get_bulk=1 n_put_bulk=4
> > n_keep=128 %_change_with_patch=-17.01 (regression) n_get_bulk=1
> > n_put_bulk=32 n_keep=32 %_change_with_patch=-25.06 (regression)
> > n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.81
> > (regression) n_get_bulk=4 n_put_bulk=1 n_keep=32
> > %_change_with_patch=53.93 n_get_bulk=4 n_put_bulk=1 n_keep=128
> > %_change_with_patch=60.90 n_get_bulk=4 n_put_bulk=4 n_keep=32
> > %_change_with_patch=1.64 n_get_bulk=4 n_put_bulk=4 n_keep=128
> > %_change_with_patch=8.76 n_get_bulk=4 n_put_bulk=32 n_keep=32
> > %_change_with_patch=-4.71 (regression) n_get_bulk=4 n_put_bulk=32
> > n_keep=128 %_change_with_patch=-3.19 (regression) n_get_bulk=32
> > n_put_bulk=1 n_keep=32 %_change_with_patch=65.63 n_get_bulk=32
> > n_put_bulk=1 n_keep=128 %_change_with_patch=75.19 n_get_bulk=32
> > n_put_bulk=4 n_keep=32 %_change_with_patch=11.75 n_get_bulk=32
> > n_put_bulk=4 n_keep=128 %_change_with_patch=15.52 n_get_bulk=32
> > n_put_bulk=32 n_keep=32 %_change_with_patch=13.45 n_get_bulk=32
> > n_put_bulk=32 n_keep=128 %_change_with_patch=11.58
> > 
> > Number of core = 2: n_get_bulk=1 n_put_bulk=1 n_keep=32
> > %_change_with_patch=18.21 n_get_bulk=1 n_put_bulk=1 n_keep=128
> > %_change_with_patch=21.89 n_get_bulk=1 n_put_bulk=4 n_keep=32
> > %_change_with_patch=-21.21 (regression) n_get_bulk=1 n_put_bulk=4
> > n_keep=128 %_change_with_patch=-17.05 (regression) n_get_bulk=1
> > n_put_bulk=32 n_keep=32 %_change_with_patch=-26.09 (regression)
> > n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.49
> > (regression) n_get_bulk=4 n_put_bulk=1 n_keep=32
> > %_change_with_patch=56.28 n_get_bulk=4 n_put_bulk=1 n_keep=128
> > %_change_with_patch=67.69 n_get_bulk=4 n_put_bulk=4 n_keep=32
> > %_change_with_patch=1.45 n_get_bulk=4 n_put_bulk=4 n_keep=128
> > %_change_with_patch=8.84 n_get_bulk=4 n_put_bulk=32 n_keep=32
> > %_change_with_patch=-5.27 (regression) n_get_bulk=4 n_put_bulk=32
> > n_keep=128 %_change_with_patch=-3.09 (regression) n_get_bulk=32
> > n_put_bulk=1 n_keep=32 %_change_with_patch=76.11 n_get_bulk=32
> > n_put_bulk=1 n_keep=128 %_change_with_patch=86.06 n_get_bulk=32
> > n_put_bulk=4 n_keep=32 %_change_with_patch=11.86 n_get_bulk=32
> > n_put_bulk=4 n_keep=128 %_change_with_patch=16.55 n_get_bulk=32
> > n_put_bulk=32 n_keep=32 %_change_with_patch=13.01 n_get_bulk=32
> > n_put_bulk=32 n_keep=128 %_change_with_patch=11.51
> > 
> > 
> > From analyzing the results, it is clear that for n_get_bulk and
> > n_put_bulk sizes of 32 there is no performance regression IMO, the
> > other sizes are not practical from performance perspective and the
> > regression in those cases can be safely ignored
> > 
> > Dharmik Thakkar (1): mempool: implement index-based per core cache
> > 
> >  lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
> >  lib/mempool/rte_mempool_ops_default.c |   7 ++ 2 files changed, 119
> >  insertions(+), 2 deletions(-)
> > 
> > -- 2.25.1
> > 
> 
> I still think this is very interesting. And your performance numbers are
> looking good.
> 
> However, it limits the size of a mempool to 4 GB. As previously
> discussed, the max mempool size can be increased by multiplying the index
> with a constant.
> 
> I would suggest using sizeof(uintptr_t) as the constant multiplier, so
> the mempool can hold objects of any size divisible by sizeof(uintptr_t).
> And it would be silly to use a mempool to hold objects smaller than
> sizeof(uintptr_t).
> 
> How does the performance look if you multiply the index by
> sizeof(uintptr_t)?
> 

Each mempool entry is cache aligned, so we can use that if we want a bigger
multiplier.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 0/1] mempool: implement index-based per core cache
  2022-01-07 11:15     ` Bruce Richardson
@ 2022-01-07 11:29       ` Morten Brørup
  2022-01-07 13:50         ` Bruce Richardson
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2022-01-07 11:29 UTC (permalink / raw)
  To: Bruce Richardson
  Cc: Dharmik Thakkar, dev, nd, honnappa.nagarahalli, ruifeng.wang

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, 7 January 2022 12.16
> 
> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> Friday, 24
> > > December 2021 23.59
> > >
> > > Current mempool per core cache implementation stores pointers to
> mbufs
> > > On 64b architectures, each pointer consumes 8B This patch replaces
> it
> > > with index-based implementation, where in each buffer is addressed
> by
> > > (pool base address + index) It reduces the amount of memory/cache
> > > required for per core cache
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance (L1 and L2 misses reduced by 0.60%) with no change in
> > > throughput
> > >
> > > Micro-benchmarking the patch using mempool_perf_test shows
> significant
> > > improvement with majority of the test cases
> > >
> > > Number of cores = 1: n_get_bulk=1 n_put_bulk=1 n_keep=32
> > > %_change_with_patch=18.01 n_get_bulk=1 n_put_bulk=1 n_keep=128
> > > %_change_with_patch=19.91 n_get_bulk=1 n_put_bulk=4 n_keep=32
> > > %_change_with_patch=-20.37 (regression) n_get_bulk=1 n_put_bulk=4
> > > n_keep=128 %_change_with_patch=-17.01 (regression) n_get_bulk=1
> > > n_put_bulk=32 n_keep=32 %_change_with_patch=-25.06 (regression)
> > > n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.81
> > > (regression) n_get_bulk=4 n_put_bulk=1 n_keep=32
> > > %_change_with_patch=53.93 n_get_bulk=4 n_put_bulk=1 n_keep=128
> > > %_change_with_patch=60.90 n_get_bulk=4 n_put_bulk=4 n_keep=32
> > > %_change_with_patch=1.64 n_get_bulk=4 n_put_bulk=4 n_keep=128
> > > %_change_with_patch=8.76 n_get_bulk=4 n_put_bulk=32 n_keep=32
> > > %_change_with_patch=-4.71 (regression) n_get_bulk=4 n_put_bulk=32
> > > n_keep=128 %_change_with_patch=-3.19 (regression) n_get_bulk=32
> > > n_put_bulk=1 n_keep=32 %_change_with_patch=65.63 n_get_bulk=32
> > > n_put_bulk=1 n_keep=128 %_change_with_patch=75.19 n_get_bulk=32
> > > n_put_bulk=4 n_keep=32 %_change_with_patch=11.75 n_get_bulk=32
> > > n_put_bulk=4 n_keep=128 %_change_with_patch=15.52 n_get_bulk=32
> > > n_put_bulk=32 n_keep=32 %_change_with_patch=13.45 n_get_bulk=32
> > > n_put_bulk=32 n_keep=128 %_change_with_patch=11.58
> > >
> > > Number of core = 2: n_get_bulk=1 n_put_bulk=1 n_keep=32
> > > %_change_with_patch=18.21 n_get_bulk=1 n_put_bulk=1 n_keep=128
> > > %_change_with_patch=21.89 n_get_bulk=1 n_put_bulk=4 n_keep=32
> > > %_change_with_patch=-21.21 (regression) n_get_bulk=1 n_put_bulk=4
> > > n_keep=128 %_change_with_patch=-17.05 (regression) n_get_bulk=1
> > > n_put_bulk=32 n_keep=32 %_change_with_patch=-26.09 (regression)
> > > n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.49
> > > (regression) n_get_bulk=4 n_put_bulk=1 n_keep=32
> > > %_change_with_patch=56.28 n_get_bulk=4 n_put_bulk=1 n_keep=128
> > > %_change_with_patch=67.69 n_get_bulk=4 n_put_bulk=4 n_keep=32
> > > %_change_with_patch=1.45 n_get_bulk=4 n_put_bulk=4 n_keep=128
> > > %_change_with_patch=8.84 n_get_bulk=4 n_put_bulk=32 n_keep=32
> > > %_change_with_patch=-5.27 (regression) n_get_bulk=4 n_put_bulk=32
> > > n_keep=128 %_change_with_patch=-3.09 (regression) n_get_bulk=32
> > > n_put_bulk=1 n_keep=32 %_change_with_patch=76.11 n_get_bulk=32
> > > n_put_bulk=1 n_keep=128 %_change_with_patch=86.06 n_get_bulk=32
> > > n_put_bulk=4 n_keep=32 %_change_with_patch=11.86 n_get_bulk=32
> > > n_put_bulk=4 n_keep=128 %_change_with_patch=16.55 n_get_bulk=32
> > > n_put_bulk=32 n_keep=32 %_change_with_patch=13.01 n_get_bulk=32
> > > n_put_bulk=32 n_keep=128 %_change_with_patch=11.51
> > >
> > >
> > > From analyzing the results, it is clear that for n_get_bulk and
> > > n_put_bulk sizes of 32 there is no performance regression IMO, the
> > > other sizes are not practical from performance perspective and the
> > > regression in those cases can be safely ignored
> > >
> > > Dharmik Thakkar (1): mempool: implement index-based per core cache
> > >
> > >  lib/mempool/rte_mempool.h             | 114
> +++++++++++++++++++++++++-
> > >  lib/mempool/rte_mempool_ops_default.c |   7 ++ 2 files changed,
> 119
> > >  insertions(+), 2 deletions(-)
> > >
> > > -- 2.25.1
> > >
> >
> > I still think this is very interesting. And your performance numbers
> are
> > looking good.
> >
> > However, it limits the size of a mempool to 4 GB. As previously
> > discussed, the max mempool size can be increased by multiplying the
> index
> > with a constant.
> >
> > I would suggest using sizeof(uintptr_t) as the constant multiplier,
> so
> > the mempool can hold objects of any size divisible by
> sizeof(uintptr_t).
> > And it would be silly to use a mempool to hold objects smaller than
> > sizeof(uintptr_t).
> >
> > How does the performance look if you multiply the index by
> > sizeof(uintptr_t)?
> >
> 
> Each mempool entry is cache aligned, so we can use that if we want a
> bigger
> multiplier.

Thanks for chiming in, Bruce.

Please also read this discussion about the multiplier:
http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 0/1] mempool: implement index-based per core cache
  2022-01-07 11:29       ` Morten Brørup
@ 2022-01-07 13:50         ` Bruce Richardson
  2022-01-08  9:37           ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Bruce Richardson @ 2022-01-07 13:50 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Dharmik Thakkar, dev, nd, honnappa.nagarahalli, ruifeng.wang

On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Friday, 7 January 2022 12.16
> > 
> > On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> > Friday, 24
> > > > December 2021 23.59
> > > >
> > > > Current mempool per core cache implementation stores pointers to
> > mbufs
> > > > On 64b architectures, each pointer consumes 8B This patch replaces
> > it
> > > > with index-based implementation, where in each buffer is addressed
> > by
> > > > (pool base address + index) It reduces the amount of memory/cache
> > > > required for per core cache
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance (L1 and L2 misses reduced by 0.60%) with no change in
> > > > throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows
> > significant
> > > > improvement with majority of the test cases
> > > >
> > > > Number of cores = 1: n_get_bulk=1 n_put_bulk=1 n_keep=32
> > > > %_change_with_patch=18.01 n_get_bulk=1 n_put_bulk=1 n_keep=128
> > > > %_change_with_patch=19.91 n_get_bulk=1 n_put_bulk=4 n_keep=32
> > > > %_change_with_patch=-20.37 (regression) n_get_bulk=1 n_put_bulk=4
> > > > n_keep=128 %_change_with_patch=-17.01 (regression) n_get_bulk=1
> > > > n_put_bulk=32 n_keep=32 %_change_with_patch=-25.06 (regression)
> > > > n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.81
> > > > (regression) n_get_bulk=4 n_put_bulk=1 n_keep=32
> > > > %_change_with_patch=53.93 n_get_bulk=4 n_put_bulk=1 n_keep=128
> > > > %_change_with_patch=60.90 n_get_bulk=4 n_put_bulk=4 n_keep=32
> > > > %_change_with_patch=1.64 n_get_bulk=4 n_put_bulk=4 n_keep=128
> > > > %_change_with_patch=8.76 n_get_bulk=4 n_put_bulk=32 n_keep=32
> > > > %_change_with_patch=-4.71 (regression) n_get_bulk=4 n_put_bulk=32
> > > > n_keep=128 %_change_with_patch=-3.19 (regression) n_get_bulk=32
> > > > n_put_bulk=1 n_keep=32 %_change_with_patch=65.63 n_get_bulk=32
> > > > n_put_bulk=1 n_keep=128 %_change_with_patch=75.19 n_get_bulk=32
> > > > n_put_bulk=4 n_keep=32 %_change_with_patch=11.75 n_get_bulk=32
> > > > n_put_bulk=4 n_keep=128 %_change_with_patch=15.52 n_get_bulk=32
> > > > n_put_bulk=32 n_keep=32 %_change_with_patch=13.45 n_get_bulk=32
> > > > n_put_bulk=32 n_keep=128 %_change_with_patch=11.58
> > > >
> > > > Number of core = 2: n_get_bulk=1 n_put_bulk=1 n_keep=32
> > > > %_change_with_patch=18.21 n_get_bulk=1 n_put_bulk=1 n_keep=128
> > > > %_change_with_patch=21.89 n_get_bulk=1 n_put_bulk=4 n_keep=32
> > > > %_change_with_patch=-21.21 (regression) n_get_bulk=1 n_put_bulk=4
> > > > n_keep=128 %_change_with_patch=-17.05 (regression) n_get_bulk=1
> > > > n_put_bulk=32 n_keep=32 %_change_with_patch=-26.09 (regression)
> > > > n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.49
> > > > (regression) n_get_bulk=4 n_put_bulk=1 n_keep=32
> > > > %_change_with_patch=56.28 n_get_bulk=4 n_put_bulk=1 n_keep=128
> > > > %_change_with_patch=67.69 n_get_bulk=4 n_put_bulk=4 n_keep=32
> > > > %_change_with_patch=1.45 n_get_bulk=4 n_put_bulk=4 n_keep=128
> > > > %_change_with_patch=8.84 n_get_bulk=4 n_put_bulk=32 n_keep=32
> > > > %_change_with_patch=-5.27 (regression) n_get_bulk=4 n_put_bulk=32
> > > > n_keep=128 %_change_with_patch=-3.09 (regression) n_get_bulk=32
> > > > n_put_bulk=1 n_keep=32 %_change_with_patch=76.11 n_get_bulk=32
> > > > n_put_bulk=1 n_keep=128 %_change_with_patch=86.06 n_get_bulk=32
> > > > n_put_bulk=4 n_keep=32 %_change_with_patch=11.86 n_get_bulk=32
> > > > n_put_bulk=4 n_keep=128 %_change_with_patch=16.55 n_get_bulk=32
> > > > n_put_bulk=32 n_keep=32 %_change_with_patch=13.01 n_get_bulk=32
> > > > n_put_bulk=32 n_keep=128 %_change_with_patch=11.51
> > > >
> > > >
> > > > From analyzing the results, it is clear that for n_get_bulk and
> > > > n_put_bulk sizes of 32 there is no performance regression IMO, the
> > > > other sizes are not practical from performance perspective and the
> > > > regression in those cases can be safely ignored
> > > >
> > > > Dharmik Thakkar (1): mempool: implement index-based per core cache
> > > >
> > > >  lib/mempool/rte_mempool.h             | 114
> > +++++++++++++++++++++++++-
> > > >  lib/mempool/rte_mempool_ops_default.c |   7 ++ 2 files changed,
> > 119
> > > >  insertions(+), 2 deletions(-)
> > > >
> > > > -- 2.25.1
> > > >
> > >
> > > I still think this is very interesting. And your performance numbers
> > are
> > > looking good.
> > >
> > > However, it limits the size of a mempool to 4 GB. As previously
> > > discussed, the max mempool size can be increased by multiplying the
> > index
> > > with a constant.
> > >
> > > I would suggest using sizeof(uintptr_t) as the constant multiplier,
> > so
> > > the mempool can hold objects of any size divisible by
> > sizeof(uintptr_t).
> > > And it would be silly to use a mempool to hold objects smaller than
> > > sizeof(uintptr_t).
> > >
> > > How does the performance look if you multiply the index by
> > > sizeof(uintptr_t)?
> > >
> > 
> > Each mempool entry is cache aligned, so we can use that if we want a
> > bigger
> > multiplier.
> 
> Thanks for chiming in, Bruce.
> 
> Please also read this discussion about the multiplier:
> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
>

I actually wondered after I had sent the email whether we had indeed an
option to disable the cache alignment or not! Thanks for pointing out that
we do. This brings a couple of additional thoughts:

* Using indexes for the cache should probably be a runtime flag rather than
  a build-time one.
* It would seem reasonable to me to disallow use of the indexed-cache flag
  and the non-cache-aligned flag simultaneously.
* On the off chance that that restriction is unacceptable, we can make
  things a little more complicated by doing a runtime computation of the
  "index-shiftwidth" to use (rough sketch below).

Overall, I think defaulting to the cacheline shiftwidth and disallowing
index-based addressing when using unaligned buffers is simplest and easiest,
unless we can come up with a valid use case for needing more than that.
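
To illustrate the runtime computation (rough sketch only, not meant as the
actual implementation; the function name and the example stride are made up):

/* Derive an "index-shiftwidth" at pool-creation time from the per-object
 * stride, so that obj = base + ((uintptr_t)index << shift).
 */
#include <stddef.h>
#include <stdio.h>

static unsigned int
index_shiftwidth(size_t obj_stride)
{
	unsigned int shift = 0;

	if (obj_stride == 0)
		return 0;
	/* largest power of two dividing the stride */
	while (((obj_stride >> shift) & 1) == 0)
		shift++;
	return shift;
}

int main(void)
{
	/* arbitrary 2176 B stride chosen for the example */
	printf("shift = %u\n", index_shiftwidth(2176));
	/* 2176 is divisible by 128 but not 256 -> shift = 7,
	 * giving a max pool size of (1ULL << 32) << 7 = 512 GB
	 */
	return 0;
}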

/Bruce

^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 0/1] mempool: implement index-based per core cache
  2022-01-07 13:50         ` Bruce Richardson
@ 2022-01-08  9:37           ` Morten Brørup
  2022-01-10  6:38             ` Jerin Jacob
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2022-01-08  9:37 UTC (permalink / raw)
  To: Bruce Richardson, Dharmik Thakkar, honnappa.nagarahalli
  Cc: dev, nd, ruifeng.wang

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, 7 January 2022 14.51
> 
> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Friday, 7 January 2022 12.16
> > >
> > > On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> > > Friday, 24
> > > > > December 2021 23.59
> > > > >
> > > > > Current mempool per core cache implementation stores pointers
> to
> > > mbufs
> > > > > On 64b architectures, each pointer consumes 8B This patch
> replaces
> > > it
> > > > > with index-based implementation, where in each buffer is
> addressed
> > > by
> > > > > (pool base address + index) It reduces the amount of
> memory/cache
> > > > > required for per core cache
> > > > >
> > > > > L3Fwd performance testing reveals minor improvements in the
> cache
> > > > > performance (L1 and L2 misses reduced by 0.60%) with no change
> in
> > > > > throughput
> > > > >
> > > > > Micro-benchmarking the patch using mempool_perf_test shows
> > > significant
> > > > > improvement with majority of the test cases
> > > > >
> > > >
> > > > I still think this is very interesting. And your performance
> numbers
> > > are
> > > > looking good.
> > > >
> > > > However, it limits the size of a mempool to 4 GB. As previously
> > > > discussed, the max mempool size can be increased by multiplying
> the
> > > index
> > > > with a constant.
> > > >
> > > > I would suggest using sizeof(uintptr_t) as the constant
> multiplier,
> > > so
> > > > the mempool can hold objects of any size divisible by
> > > sizeof(uintptr_t).
> > > > And it would be silly to use a mempool to hold objects smaller
> than
> > > > sizeof(uintptr_t).
> > > >
> > > > How does the performance look if you multiply the index by
> > > > sizeof(uintptr_t)?
> > > >
> > >
> > > Each mempool entry is cache aligned, so we can use that if we want
> a
> > > bigger
> > > multiplier.
> >
> > Thanks for chiming in, Bruce.
> >
> > Please also read this discussion about the multiplier:
> > http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
> >
> 
> I actually wondered after I had sent the email whether we had indeed an
> option to disable the cache alignment or not! Thanks for pointing out
> that
> we do. This brings a couple additional thoughts:
> 
> * Using indexes for the cache should probably be a runtime flag rather
> than
>   a build-time one.
> * It would seem reasonable to me to disallow use of the indexed-cache
> flag
>   and the non-cache aligned flag simultaneously.
> * On the offchance that that restriction is unacceptable, then we can
>   make things a little more complicated by doing a runtime computation
> of
>   the "index-shiftwidth" to use.
> 
> Overall, I think defaulting to cacheline shiftwidth and disallowing
> index-based addressing when using unaligned buffers is simplest and
> easiest
> unless we can come up with a valid usecase for needing more than that.
> 
> /Bruce

This feature is a performance optimization.

With that in mind, it should not introduce function pointers or similar run-time checks in the fast path to determine what kind of cache to use per mempool. And if an index multiplier is implemented, it should be a compile-time constant, probably something between sizeof(uintptr_t) and RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE).

The patch comes with a tradeoff between better performance and a limited mempool size, and possibly some limitations for very small objects that are deliberately not cache line aligned in order to avoid wasting memory (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).

With no multiplier, the only tradeoff is that the mempool size is limited to 4 GB.

If the multiplier is small (i.e. 8 bytes), the only tradeoff is that the mempool size is limited to 32 GB. (And some memory is wasted for objects smaller than 8 bytes; but I don't think anyone would use a mempool to hold objects smaller than 8 bytes.)

If the multiplier is larger (e.g. the 64 byte cache line size), the mempool size is instead limited to 256 GB, but RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ then has no effect.
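
Just to spell out the arithmetic behind these limits: a 32 bit index can
address 2^32 distinct offsets, so max mempool size = 2^32 * multiplier.
A trivial standalone program to print the numbers above (illustrative only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t span = UINT64_C(1) << 32;  /* distinct 32 bit indices */
	const unsigned int mult[] = { 1, 8, 64 }; /* none, uintptr_t, cache line */
	unsigned int i;

	for (i = 0; i < sizeof(mult) / sizeof(mult[0]); i++)
		printf("multiplier %2u B -> max mempool %3llu GB\n", mult[i],
		       (unsigned long long)((span * mult[i]) >> 30));
	return 0;
}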

Note: 32 bit platforms get no benefit from this patch: the pointer already uses only 4 bytes, so replacing the pointer with a 4 byte index makes no difference.


Since this feature is a performance optimization only, and doesn't provide any new features, I don't mind it being a compile time option.

If this feature is a compile time option, and the mempool library is compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the public header file, so compilation of applications using the flag will fail. And rte_mempool_create() could RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags parameter, or emit a warning about the flag being ignored. Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger than the library supports, possibly emitting a message that the mempool library should be built without this feature to support the larger mempool.

Here is another thought: If only exotic applications use mempools larger than 32 GB, this would be a generally acceptable limit, and DPDK should use index-based cache as default, making the opposite (i.e. pointer-based cache) a compile time option instead. A similar decision was recently made for limiting the RTE_MAX_LCORE default.


Although DPDK is moving away from compile time options in order to better support Linux distros, there should be a general exception for performance and memory optimizations. Otherwise, network appliance vendors will inherit the increasing amount of DPDK bloat, and we (network appliance vendors) will eventually be forced to fork DPDK to get rid of the bloat and achieve the goals originally intended by DPDK. If anyone disagrees with the principle about a general exception for performance and memory optimizations, I would like to pass on the decision to the Techboard!


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 0/1] mempool: implement index-based per core cache
  2022-01-08  9:37           ` Morten Brørup
@ 2022-01-10  6:38             ` Jerin Jacob
  2022-01-13  5:31               ` Dharmik Thakkar
  0 siblings, 1 reply; 52+ messages in thread
From: Jerin Jacob @ 2022-01-10  6:38 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Bruce Richardson, Dharmik Thakkar, Honnappa Nagarahalli,
	dpdk-dev, nd, Ruifeng Wang (Arm Technology China)

On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Friday, 7 January 2022 14.51
> >
> > On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > Sent: Friday, 7 January 2022 12.16
> > > >
> > > > On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > > > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> > > > Friday, 24
> > > > > > December 2021 23.59
> > > > > >
> > > > > > Current mempool per core cache implementation stores pointers
> > to
> > > > mbufs
> > > > > > On 64b architectures, each pointer consumes 8B This patch
> > replaces
> > > > it
> > > > > > with index-based implementation, where in each buffer is
> > addressed
> > > > by
> > > > > > (pool base address + index) It reduces the amount of
> > memory/cache
> > > > > > required for per core cache
> > > > > >
> > > > > > L3Fwd performance testing reveals minor improvements in the
> > cache
> > > > > > performance (L1 and L2 misses reduced by 0.60%) with no change
> > in
> > > > > > throughput
> > > > > >
> > > > > > Micro-benchmarking the patch using mempool_perf_test shows
> > > > significant
> > > > > > improvement with majority of the test cases
> > > > > >
> > > > >
> > > > > I still think this is very interesting. And your performance
> > numbers
> > > > are
> > > > > looking good.
> > > > >
> > > > > However, it limits the size of a mempool to 4 GB. As previously
> > > > > discussed, the max mempool size can be increased by multiplying
> > the
> > > > index
> > > > > with a constant.
> > > > >
> > > > > I would suggest using sizeof(uintptr_t) as the constant
> > multiplier,
> > > > so
> > > > > the mempool can hold objects of any size divisible by
> > > > sizeof(uintptr_t).
> > > > > And it would be silly to use a mempool to hold objects smaller
> > than
> > > > > sizeof(uintptr_t).
> > > > >
> > > > > How does the performance look if you multiply the index by
> > > > > sizeof(uintptr_t)?
> > > > >
> > > >
> > > > Each mempool entry is cache aligned, so we can use that if we want
> > a
> > > > bigger
> > > > multiplier.
> > >
> > > Thanks for chiming in, Bruce.
> > >
> > > Please also read this discussion about the multiplier:
> > > http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
> > >
> >
> > I actually wondered after I had sent the email whether we had indeed an
> > option to disable the cache alignment or not! Thanks for pointing out
> > that
> > we do. This brings a couple additional thoughts:
> >
> > * Using indexes for the cache should probably be a runtime flag rather
> > than
> >   a build-time one.
> > * It would seem reasonable to me to disallow use of the indexed-cache
> > flag
> >   and the non-cache aligned flag simultaneously.
> > * On the offchance that that restriction is unacceptable, then we can
> >   make things a little more complicated by doing a runtime computation
> > of
> >   the "index-shiftwidth" to use.
> >
> > Overall, I think defaulting to cacheline shiftwidth and disallowing
> > index-based addressing when using unaligned buffers is simplest and
> > easiest
> > unless we can come up with a valid usecase for needing more than that.
> >
> > /Bruce
>
> This feature is a performance optimization.
>
> With that in mind, it should not introduce function pointers or similar run-time checks or in the fast path, to determine what kind of cache to use per mempool. And if an index multiplier is implemented, it should be a compile time constant, probably something between sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
>
> The patch comes with a tradeoff between better performance and limited mempool size, and possibly some limitations regarding very small objects that are not cache line aligned to avoid wasting memory (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
>
> With no multiplier, the only tradeoff is that the mempool size is limited to 4 GB.
>
> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the mempool size is limited to 32 GB. (And a waste of memory for objects smaller than 8 byte; but I don't think anyone would use a mempool to hold objects smaller than 8 byte.)
>
> If the multiplier is larger (i.e. 64 bytes cache line size), the mempool size is instead limited to 256 GB, but RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
>
> Note: 32 bit platforms have no benefit from this patch: The pointer already only uses 4 bytes, so replacing the pointer with a 4 byte index makes no difference.
>
>
> Since this feature is a performance optimization only, and doesn't provide any new features, I don't mind it being a compile time option.
>
> If this feature is a compile time option, and the mempool library is compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the public header file, so compilation of applications using the flag will fail. And rte_mempool_create() could RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags parameter, or emit a warning about the flag being ignored. Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger than the library supports, possibly emitting a message that the mempool library should be built without this feature to support the larger mempool.
>
> Here is another thought: If only exotic applications use mempools larger than 32 GB, this would be a generally acceptable limit, and DPDK should use index-based cache as default, making the opposite (i.e. pointer-based cache) a compile time option instead. A similar decision was recently made for limiting the RTE_MAX_LCORE default.
>
>
> Although DPDK is moving away from compile time options in order to better support Linux distros, there should be a general exception for performance and memory optimizations. Otherwise, network appliance vendors will inherit the increasing amount of DPDK bloat, and we (network appliance vendors) will eventually be forced to fork DPDK to get rid of the bloat and achieve the goals originally intended by DPDK.

Agree with Morten's view on this.

>If anyone disagrees with the principle about a general exception for performance and memory optimizations, I would like to pass on the decision to the Techboard!
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 1/1] mempool: implement index-based per core cache
  2021-12-24 22:59   ` [PATCH 1/1] " Dharmik Thakkar
@ 2022-01-11  2:26     ` Ananyev, Konstantin
  2022-01-13  5:17       ` Dharmik Thakkar
  0 siblings, 1 reply; 52+ messages in thread
From: Ananyev, Konstantin @ 2022-01-11  2:26 UTC (permalink / raw)
  To: Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang



 
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput

I feel really sceptical about this patch and the whole idea in general:
- From what I read above, there is no real performance improvement observed.
  (In fact, on my IA boxes mempool_perf_autotest reports a ~20% slowdown;
  see below for more details.)
- The space utilization difference looks negligible too.
- The change introduces a new build-time config option with a major limitation:
   All memzones in a pool have to be within the same 4GB boundary.
   To address it properly, extra changes will be required in the init(/populate) part of the code.
   All that will complicate the mempool code and make it more error prone
   and harder to maintain.
But, as there is no real gain in return, there is no point in adding such extra complexity at all.

Konstantin

CSX 2.1 GHz
==========

echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci

params : rate_persec (normal/index-based/diff %)
(with cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 740989337.00/504116019.00/-31.97
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756495155.00/615002931.00/-18.70
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1483499110.00/1007248997.00/-32.10
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1512439807.00/1229927218.00/-18.68
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5933668757.00/4029048421.00/-32.10
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6049234942.00/4921111344.00/-18.65

(with user-owned cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 630600499.00/504312627.00/-20.03
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756259225.00/615042252.00/-18.67
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1262052966.00/1007039283.00/-20.21
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1517853081.00/1230818508.00/-18.91
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5054529533.00/4028052273.00/-20.31
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6059340592.00/4912893129.00/-18.92

> 
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
>  lib/mempool/rte_mempool_ops_default.c |   7 ++
>  2 files changed, 119 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1e7a3c15273c..4fabd3b1920b 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -50,6 +50,10 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
> 
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#include <rte_vect.h>
> +#endif
> +
>  #include "rte_mempool_trace_fp.h"
> 
>  #ifdef __cplusplus
> @@ -239,6 +243,9 @@ struct rte_mempool {
>  	int32_t ops_index;
> 
>  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	void *pool_base_value; /**< Base value to calculate indices */
> +#endif
> 
>  	uint32_t populated_size;         /**< Number of populated objects. */
>  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>  	if (cache == NULL || cache->len == 0)
>  		return;
>  	rte_mempool_trace_cache_flush(cache, mp);
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	unsigned int i;
> +	unsigned int cache_len = cache->len;
> +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +	void *base_value = mp->pool_base_value;
> +	uint32_t *cache_objs = (uint32_t *) cache->objs;
> +	for (i = 0; i < cache_len; i++)
> +		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> +#else
>  	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +#endif
>  	cache->len = 0;
>  }
> 
> @@ -1334,8 +1353,13 @@ static __rte_always_inline void
>  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  			   unsigned int n, struct rte_mempool_cache *cache)
>  {
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	uint32_t *cache_objs;
> +	void *base_value;
> +	uint32_t i;
> +#else
>  	void **cache_objs;
> -
> +#endif
>  	/* increment stat now, adding in mempool always success */
>  	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
>  	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> @@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
>  		goto ring_enqueue;
> 
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	cache_objs = (uint32_t *) cache->objs;
> +	cache_objs = &cache_objs[cache->len];
> +	base_value = mp->pool_base_value;
> +#else
>  	cache_objs = &cache->objs[cache->len];
> +#endif
> 
>  	/*
>  	 * The cache follows the following algorithm
> @@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  	 */
> 
>  	/* Add elements back into the cache */
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#if defined __ARM_NEON
> +	uint64x2_t v_obj_table;
> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +	uint32x2_t v_cache_objs;
> +
> +	for (i = 0; i < (n & ~0x1); i += 2) {
> +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> +		vst1_u32(cache_objs + i, v_cache_objs);
> +	}
> +	if (n & 0x1) {
> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +	}
> +#else
> +	for (i = 0; i < n; i++) {
> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +	}
> +#endif
> +#else
>  	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +#endif
> 
>  	cache->len += n;
> 
>  	if (cache->len >= cache->flushthresh) {
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
> +				cache->len - cache->size);
> +#else
>  		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
>  				cache->len - cache->size);
> +#endif
>  		cache->len = cache->size;
>  	}
> 
> @@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  {
>  	int ret;
>  	uint32_t index, len;
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	uint32_t i;
> +	uint32_t *cache_objs;
> +#else
>  	void **cache_objs;
> -
> +#endif
>  	/* No cache provided or cannot be satisfied from cache */
>  	if (unlikely(cache == NULL || n >= cache->size))
>  		goto ring_dequeue;
> 
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	void *base_value = mp->pool_base_value;
> +	cache_objs = (uint32_t *) cache->objs;
> +#else
>  	cache_objs = cache->objs;
> +#endif
> 
>  	/* Can this be satisfied from the cache? */
>  	if (cache->len < n) {
> @@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  		uint32_t req = n + (cache->size - cache->len);
> 
>  		/* How many do we require i.e. number to fill the cache + the request */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> +		ret = rte_mempool_ops_dequeue_bulk(mp,
> +			temp_objs, req);
> +#else
>  		ret = rte_mempool_ops_dequeue_bulk(mp,
>  			&cache->objs[cache->len], req);
> +#endif
>  		if (unlikely(ret < 0)) {
>  			/*
>  			 * In the off chance that we are buffer constrained,
> @@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  			goto ring_dequeue;
>  		}
> 
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +		len = cache->len;
> +		for (i = 0; i < req; ++i, ++len) {
> +			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> +								base_value);
> +		}
> +#endif
>  		cache->len += req;
>  	}
> 
>  	/* Now fill in the response ... */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#if defined __ARM_NEON
> +	uint64x2_t v_obj_table;
> +	uint64x2_t v_cache_objs;
> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
> +	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
> +						len -= 4, obj_table += 4) {
> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
> +	}
> +	switch (n & 0x3) {
> +	case 3:
> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> +								/* fallthrough */
> +	case 2:
> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> +								/* fallthrough */
> +	case 1:
> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> +	}
> +#else
> +	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
> +#else
>  	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>  		*obj_table = cache_objs[len];
> +#endif
> 
>  	cache->len -= n;
> 
> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
> index 22fccf9d7619..3543cad9d4ce 100644
> --- a/lib/mempool/rte_mempool_ops_default.c
> +++ b/lib/mempool/rte_mempool_ops_default.c
> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
>  		obj = va + off;
>  		obj_cb(mp, obj_cb_arg, obj,
>  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +		/* Store pool base value to calculate indices for index-based
> +		 * lcore cache implementation
> +		 */
> +		if (i == 0)
> +			mp->pool_base_value = obj;
> +#endif
>  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>  		off += mp->elt_size + mp->trailer_size;
>  	}
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 1/1] mempool: implement index-based per core cache
  2022-01-11  2:26     ` Ananyev, Konstantin
@ 2022-01-13  5:17       ` Dharmik Thakkar
  2022-01-13 10:37         ` Ananyev, Konstantin
  0 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2022-01-13  5:17 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Honnappa Nagarahalli,
	Ruifeng Wang

Hi Konstantin,

Thank you for your comments and the test report!

> On Jan 10, 2022, at 8:26 PM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> 
> 
> 
>> Current mempool per core cache implementation stores pointers to mbufs
>> On 64b architectures, each pointer consumes 8B
>> This patch replaces it with index-based implementation,
>> where in each buffer is addressed by (pool base address + index)
>> It reduces the amount of memory/cache required for per core cache
>> 
>> L3Fwd performance testing reveals minor improvements in the cache
>> performance (L1 and L2 misses reduced by 0.60%)
>> with no change in throughput
> 
> I feel really sceptical about that patch and the whole idea in general:
> - From what I read above there is no real performance improvement observed.
>  (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>  see below for more details). 

Currently, the optimizations (loop unrolling and vectorization) are only implemented for ARM64.
Similar optimizations can be implemented for x86 platforms, which should close the performance gap
and, in my understanding, should give better performance for a bulk size of 32.
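
For example, a rough SSE2 sketch of the pointer-to-index conversion on the
put path could look like the below (illustrative only, not part of this
patch; the helper name is made up):

/* Convert an array of object pointers to 32 bit offsets from the pool base,
 * two pointers per iteration, mirroring the NEON loop in the patch.
 */
#include <emmintrin.h>
#include <stdint.h>

static inline void
ptrs_to_indices_sse2(void * const *obj_table, uint32_t *cache_objs,
		     unsigned int n, uintptr_t base)
{
	unsigned int i;
	const __m128i v_base = _mm_set1_epi64x((long long)base);

	for (i = 0; i + 2 <= n; i += 2) {
		/* two 64 bit pointers -> two 64 bit offsets */
		__m128i v = _mm_loadu_si128((const __m128i *)&obj_table[i]);

		v = _mm_sub_epi64(v, v_base);
		/* keep the low 32 bits of each 64 bit lane and store 8 B */
		v = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 1, 2, 0));
		_mm_storel_epi64((__m128i *)&cache_objs[i], v);
	}
	if (n & 1)
		cache_objs[i] = (uint32_t)((uintptr_t)obj_table[i] - base);
}

A similar widening loop would be needed on the get path to rebuild the
pointers from the indices.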

> - Space utilization difference looks neglectable too.

Sorry, I did not understand this point.

> - The change introduces a new build time config option with a major limitation:
>   All memzones in a pool have to be within the same 4GB boundary. 
>   To address it properly, extra changes will be required in init(/populate) part of the code.

I agree with the above-mentioned challenges and I am currently working on resolving these issues.
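
For instance, a populate-time sanity check along these lines could reject
pools whose memory spans more than 4GB (rough sketch only; the helper name
is made up, and the final fix will likely need to constrain the allocation
itself rather than just check it):

#include <errno.h>
#include <stdint.h>

#include <rte_mempool.h>

/* Walk the pool's memory chunks and verify they all fall within a 4GB
 * window, so that a 32 bit offset from the pool base can reach them.
 */
static int
mempool_check_4gb_span(const struct rte_mempool *mp)
{
	const struct rte_mempool_memhdr *hdr;
	uintptr_t lo = UINTPTR_MAX, hi = 0;

	STAILQ_FOREACH(hdr, &mp->mem_list, next) {
		lo = RTE_MIN(lo, (uintptr_t)hdr->addr);
		hi = RTE_MAX(hi, (uintptr_t)hdr->addr + hdr->len);
	}
	if (hi < lo) /* nothing populated yet */
		return 0;
	return (hi - lo) <= (UINT64_C(1) << 32) ? 0 : -EOVERFLOW;
}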

>   All that will complicate mempool code, will make it more error prone
>   and harder to maintain.
> But, as there is no real gain in return - no point to add such extra complexity at all.
> 
> Konstantin
> 
> CSX 2.1 GHz
> ==========
> 
> echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci
> 
> params :                                                                                                  rate_persec  	
>                                                                                                                 (normal/index-based/diff %)
> (with cache)
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 740989337.00/504116019.00/-31.97
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756495155.00/615002931.00/-18.70
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1483499110.00/1007248997.00/-32.10
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1512439807.00/1229927218.00/-18.68
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5933668757.00/4029048421.00/-32.10
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6049234942.00/4921111344.00/-18.65
> 
> (with user-owned cache)
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 630600499.00/504312627.00/-20.03
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756259225.00/615042252.00/-18.67
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1262052966.00/1007039283.00/-20.21
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1517853081.00/1230818508.00/-18.91
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5054529533.00/4028052273.00/-20.31
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6059340592.00/4912893129.00/-18.92
> 
>> 
>> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>> lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
>> lib/mempool/rte_mempool_ops_default.c |   7 ++
>> 2 files changed, 119 insertions(+), 2 deletions(-)
>> 
>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>> index 1e7a3c15273c..4fabd3b1920b 100644
>> --- a/lib/mempool/rte_mempool.h
>> +++ b/lib/mempool/rte_mempool.h
>> @@ -50,6 +50,10 @@
>> #include <rte_memcpy.h>
>> #include <rte_common.h>
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#include <rte_vect.h>
>> +#endif
>> +
>> #include "rte_mempool_trace_fp.h"
>> 
>> #ifdef __cplusplus
>> @@ -239,6 +243,9 @@ struct rte_mempool {
>> 	int32_t ops_index;
>> 
>> 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	void *pool_base_value; /**< Base value to calculate indices */
>> +#endif
>> 
>> 	uint32_t populated_size;         /**< Number of populated objects. */
>> 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
>> @@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>> 	if (cache == NULL || cache->len == 0)
>> 		return;
>> 	rte_mempool_trace_cache_flush(cache, mp);
>> +
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	unsigned int i;
>> +	unsigned int cache_len = cache->len;
>> +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
>> +	void *base_value = mp->pool_base_value;
>> +	uint32_t *cache_objs = (uint32_t *) cache->objs;
>> +	for (i = 0; i < cache_len; i++)
>> +		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
>> +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>> +#else
>> 	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
>> +#endif
>> 	cache->len = 0;
>> }
>> 
>> @@ -1334,8 +1353,13 @@ static __rte_always_inline void
>> rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> 			   unsigned int n, struct rte_mempool_cache *cache)
>> {
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	uint32_t *cache_objs;
>> +	void *base_value;
>> +	uint32_t i;
>> +#else
>> 	void **cache_objs;
>> -
>> +#endif
>> 	/* increment stat now, adding in mempool always success */
>> 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
>> 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
>> @@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> 	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
>> 		goto ring_enqueue;
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	cache_objs = (uint32_t *) cache->objs;
>> +	cache_objs = &cache_objs[cache->len];
>> +	base_value = mp->pool_base_value;
>> +#else
>> 	cache_objs = &cache->objs[cache->len];
>> +#endif
>> 
>> 	/*
>> 	 * The cache follows the following algorithm
>> @@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> 	 */
>> 
>> 	/* Add elements back into the cache */
>> +
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#if defined __ARM_NEON
>> +	uint64x2_t v_obj_table;
>> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
>> +	uint32x2_t v_cache_objs;
>> +
>> +	for (i = 0; i < (n & ~0x1); i += 2) {
>> +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
>> +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
>> +		vst1_u32(cache_objs + i, v_cache_objs);
>> +	}
>> +	if (n & 0x1) {
>> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
>> +	}
>> +#else
>> +	for (i = 0; i < n; i++) {
>> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
>> +	}
>> +#endif
>> +#else
>> 	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
>> +#endif
>> 
>> 	cache->len += n;
>> 
>> 	if (cache->len >= cache->flushthresh) {
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>> +				cache->len - cache->size);
>> +#else
>> 		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
>> 				cache->len - cache->size);
>> +#endif
>> 		cache->len = cache->size;
>> 	}
>> 
>> @@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> {
>> 	int ret;
>> 	uint32_t index, len;
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	uint32_t i;
>> +	uint32_t *cache_objs;
>> +#else
>> 	void **cache_objs;
>> -
>> +#endif
>> 	/* No cache provided or cannot be satisfied from cache */
>> 	if (unlikely(cache == NULL || n >= cache->size))
>> 		goto ring_dequeue;
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +	void *base_value = mp->pool_base_value;
>> +	cache_objs = (uint32_t *) cache->objs;
>> +#else
>> 	cache_objs = cache->objs;
>> +#endif
>> 
>> 	/* Can this be satisfied from the cache? */
>> 	if (cache->len < n) {
>> @@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> 		uint32_t req = n + (cache->size - cache->len);
>> 
>> 		/* How many do we require i.e. number to fill the cache + the request */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>> +		ret = rte_mempool_ops_dequeue_bulk(mp,
>> +			temp_objs, req);
>> +#else
>> 		ret = rte_mempool_ops_dequeue_bulk(mp,
>> 			&cache->objs[cache->len], req);
>> +#endif
>> 		if (unlikely(ret < 0)) {
>> 			/*
>> 			 * In the off chance that we are buffer constrained,
>> @@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> 			goto ring_dequeue;
>> 		}
>> 
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		len = cache->len;
>> +		for (i = 0; i < req; ++i, ++len) {
>> +			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
>> +								base_value);
>> +		}
>> +#endif
>> 		cache->len += req;
>> 	}
>> 
>> 	/* Now fill in the response ... */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#if defined __ARM_NEON
>> +	uint64x2_t v_obj_table;
>> +	uint64x2_t v_cache_objs;
>> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
>> +
>> +	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
>> +						len -= 4, obj_table += 4) {
>> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
>> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
>> +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
>> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
>> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
>> +		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
>> +	}
>> +	switch (n & 0x3) {
>> +	case 3:
>> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> +								/* fallthrough */
>> +	case 2:
>> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> +								/* fallthrough */
>> +	case 1:
>> +		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> +	}
>> +#else
>> +	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
>> +#endif
>> +#else
>> 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>> 		*obj_table = cache_objs[len];
>> +#endif
>> 
>> 	cache->len -= n;
>> 
>> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
>> index 22fccf9d7619..3543cad9d4ce 100644
>> --- a/lib/mempool/rte_mempool_ops_default.c
>> +++ b/lib/mempool/rte_mempool_ops_default.c
>> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
>> 		obj = va + off;
>> 		obj_cb(mp, obj_cb_arg, obj,
>> 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +		/* Store pool base value to calculate indices for index-based
>> +		 * lcore cache implementation
>> +		 */
>> +		if (i == 0)
>> +			mp->pool_base_value = obj;
>> +#endif
>> 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>> 		off += mp->elt_size + mp->trailer_size;
>> 	}
>> --
>> 2.25.1
> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 0/1] mempool: implement index-based per core cache
  2022-01-10  6:38             ` Jerin Jacob
@ 2022-01-13  5:31               ` Dharmik Thakkar
  2023-07-06 17:43                 ` Stephen Hemminger
  0 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2022-01-13  5:31 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Morten Brørup, Bruce Richardson, Honnappa Nagarahalli,
	dpdk-dev, nd, Ruifeng Wang

Hi,

Thank you for your valuable review comments and suggestions!

I will be sending out a v2 in which I have increased the maximum mempool size to 32GB by dividing the index by sizeof(uintptr_t).
However, I am seeing a ~5% performance degradation with mempool_perf_autotest (for a bulk size of 32) with this change
when compared to the base performance.
Earlier, without this change, I was seeing an improvement of ~13% compared to the base performance, so this is a significant degradation.
I would appreciate your review comments on v2.
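
For reference, the v2 change boils down to scaling the stored offset by sizeof(uintptr_t); a minimal sketch of the encode/decode (illustrative only; the v2 patch itself carries the full unrolled/vectorized version):

#include <stdint.h>

/* Illustrative encode/decode only (not the patch itself): scale the byte
 * offset by sizeof(uintptr_t), i.e. >> 3 on 64-bit, which extends the
 * addressable range from 4GB to 32GB at the cost of an extra shift. */
static inline uint32_t
obj_to_idx(const void *obj, const void *base)
{
	return (uint32_t)(((uintptr_t)obj - (uintptr_t)base) >> 3);
}

static inline void *
idx_to_obj(uint32_t idx, const void *base)
{
	return (void *)((uintptr_t)base + ((uintptr_t)idx << 3));
}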

Thank you!

> On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com> wrote:
> 
> On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>> 
>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
>>> Sent: Friday, 7 January 2022 14.51
>>> 
>>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
>>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
>>>>> Sent: Friday, 7 January 2022 12.16
>>>>> 
>>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
>>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
>>>>> Friday, 24
>>>>>>> December 2021 23.59
>>>>>>> 
>>>>>>> Current mempool per core cache implementation stores pointers
>>> to
>>>>> mbufs
>>>>>>> On 64b architectures, each pointer consumes 8B This patch
>>> replaces
>>>>> it
>>>>>>> with index-based implementation, where in each buffer is
>>> addressed
>>>>> by
>>>>>>> (pool base address + index) It reduces the amount of
>>> memory/cache
>>>>>>> required for per core cache
>>>>>>> 
>>>>>>> L3Fwd performance testing reveals minor improvements in the
>>> cache
>>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no change
>>> in
>>>>>>> throughput
>>>>>>> 
>>>>>>> Micro-benchmarking the patch using mempool_perf_test shows
>>>>> significant
>>>>>>> improvement with majority of the test cases
>>>>>>> 
>>>>>> 
>>>>>> I still think this is very interesting. And your performance
>>> numbers
>>>>> are
>>>>>> looking good.
>>>>>> 
>>>>>> However, it limits the size of a mempool to 4 GB. As previously
>>>>>> discussed, the max mempool size can be increased by multiplying
>>> the
>>>>> index
>>>>>> with a constant.
>>>>>> 
>>>>>> I would suggest using sizeof(uintptr_t) as the constant
>>> multiplier,
>>>>> so
>>>>>> the mempool can hold objects of any size divisible by
>>>>> sizeof(uintptr_t).
>>>>>> And it would be silly to use a mempool to hold objects smaller
>>> than
>>>>>> sizeof(uintptr_t).
>>>>>> 
>>>>>> How does the performance look if you multiply the index by
>>>>>> sizeof(uintptr_t)?
>>>>>> 
>>>>> 
>>>>> Each mempool entry is cache aligned, so we can use that if we want
>>> a
>>>>> bigger
>>>>> multiplier.
>>>> 
>>>> Thanks for chiming in, Bruce.
>>>> 
>>>> Please also read this discussion about the multiplier:
>>>> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
>>>> 
>>> 
>>> I actually wondered after I had sent the email whether we had indeed an
>>> option to disable the cache alignment or not! Thanks for pointing out
>>> that
>>> we do. This brings a couple additional thoughts:
>>> 
>>> * Using indexes for the cache should probably be a runtime flag rather
>>> than
>>>  a build-time one.
>>> * It would seem reasonable to me to disallow use of the indexed-cache
>>> flag
>>>  and the non-cache aligned flag simultaneously.
>>> * On the offchance that that restriction is unacceptable, then we can
>>>  make things a little more complicated by doing a runtime computation
>>> of
>>>  the "index-shiftwidth" to use.
>>> 
>>> Overall, I think defaulting to cacheline shiftwidth and disallowing
>>> index-based addressing when using unaligned buffers is simplest and
>>> easiest
>>> unless we can come up with a valid usecase for needing more than that.
>>> 
>>> /Bruce
>> 
>> This feature is a performance optimization.
>> 
>> With that in mind, it should not introduce function pointers or similar run-time checks or in the fast path, to determine what kind of cache to use per mempool. And if an index multiplier is implemented, it should be a compile time constant, probably something between sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
>> 
>> The patch comes with a tradeoff between better performance and limited mempool size, and possibly some limitations regarding very small objects that are not cache line aligned to avoid wasting memory (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
>> 
>> With no multiplier, the only tradeoff is that the mempool size is limited to 4 GB.
>> 
>> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the mempool size is limited to 32 GB. (And a waste of memory for objects smaller than 8 byte; but I don't think anyone would use a mempool to hold objects smaller than 8 byte.)
>> 
>> If the multiplier is larger (i.e. 64 bytes cache line size), the mempool size is instead limited to 256 GB, but RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
>> 
>> Note: 32 bit platforms have no benefit from this patch: The pointer already only uses 4 bytes, so replacing the pointer with a 4 byte index makes no difference.
>> 
>> 
>> Since this feature is a performance optimization only, and doesn't provide any new features, I don't mind it being a compile time option.
>> 
>> If this feature is a compile time option, and the mempool library is compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the public header file, so compilation of applications using the flag will fail. And rte_mempool_create() could RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags parameter, or emit a warning about the flag being ignored. Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger than the library supports, possibly emitting a message that the mempool library should be built without this feature to support the larger mempool.
>> 
>> Here is another thought: If only exotic applications use mempools larger than 32 GB, this would be a generally acceptable limit, and DPDK should use index-based cache as default, making the opposite (i.e. pointer-based cache) a compile time option instead. A similar decision was recently made for limiting the RTE_MAX_LCORE default.
>> 
>> 
>> Although DPDK is moving away from compile time options in order to better support Linux distros, there should be a general exception for performance and memory optimizations. Otherwise, network appliance vendors will inherit the increasing amount of DPDK bloat, and we (network appliance vendors) will eventually be forced to fork DPDK to get rid of the bloat and achieve the goals originally intended by DPDK.
> 
> Agree with Morten's view on this.
> 
>> If anyone disagrees with the principle about a general exception for performance and memory optimizations, I would like to pass on the decision to the Techboard!
>> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH v2 0/1] mempool: implement index-based per core cache
  2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
  2021-12-24 22:59   ` [PATCH 1/1] " Dharmik Thakkar
  2021-12-25  0:16   ` [PATCH 0/1] " Morten Brørup
@ 2022-01-13  5:36   ` Dharmik Thakkar
  2022-01-13  5:36     ` [PATCH v2 1/1] " Dharmik Thakkar
  2 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2022-01-13  5:36 UTC (permalink / raw)
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang, Dharmik Thakkar

Current mempool per core cache implementation stores pointers to mbufs
On 64b architectures, each pointer consumes 8B
This patch replaces it with index-based implementation,
where in each buffer is addressed by (pool base address + index)
It reduces the amount of memory/cache required for per core cache

L3Fwd performance testing reveals minor improvements in the cache
performance (L1 and L2 misses reduced by 0.60%)
with no change in throughput

Micro-benchmarking the patch using mempool_perf_test shows
significant improvement with majority of the test cases

Number of cores = 1:
n_get_bulk=1 n_put_bulk=1 n_keep=32 %_change_with_patch=18.01
n_get_bulk=1 n_put_bulk=1 n_keep=128 %_change_with_patch=19.91
n_get_bulk=1 n_put_bulk=4 n_keep=32 %_change_with_patch=-20.37 (regression)
n_get_bulk=1 n_put_bulk=4 n_keep=128 %_change_with_patch=-17.01 (regression) 
n_get_bulk=1 n_put_bulk=32 n_keep=32 %_change_with_patch=-25.06 (regression)
n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.81 (regression)
n_get_bulk=4 n_put_bulk=1 n_keep=32 %_change_with_patch=53.93
n_get_bulk=4 n_put_bulk=1 n_keep=128 %_change_with_patch=60.90
n_get_bulk=4 n_put_bulk=4 n_keep=32 %_change_with_patch=1.64
n_get_bulk=4 n_put_bulk=4 n_keep=128 %_change_with_patch=8.76
n_get_bulk=4 n_put_bulk=32 n_keep=32 %_change_with_patch=-4.71 (regression)
n_get_bulk=4 n_put_bulk=32 n_keep=128 %_change_with_patch=-3.19 (regression)
n_get_bulk=32 n_put_bulk=1 n_keep=32 %_change_with_patch=65.63
n_get_bulk=32 n_put_bulk=1 n_keep=128 %_change_with_patch=75.19
n_get_bulk=32 n_put_bulk=4 n_keep=32 %_change_with_patch=11.75
n_get_bulk=32 n_put_bulk=4 n_keep=128 %_change_with_patch=15.52
n_get_bulk=32 n_put_bulk=32 n_keep=32 %_change_with_patch=13.45
n_get_bulk=32 n_put_bulk=32 n_keep=128 %_change_with_patch=11.58

Number of core = 2:
n_get_bulk=1 n_put_bulk=1 n_keep=32 %_change_with_patch=18.21
n_get_bulk=1 n_put_bulk=1 n_keep=128 %_change_with_patch=21.89
n_get_bulk=1 n_put_bulk=4 n_keep=32 %_change_with_patch=-21.21 (regression)
n_get_bulk=1 n_put_bulk=4 n_keep=128 %_change_with_patch=-17.05 (regression)
n_get_bulk=1 n_put_bulk=32 n_keep=32 %_change_with_patch=-26.09 (regression)
n_get_bulk=1 n_put_bulk=32 n_keep=128 %_change_with_patch=-23.49 (regression)
n_get_bulk=4 n_put_bulk=1 n_keep=32 %_change_with_patch=56.28
n_get_bulk=4 n_put_bulk=1 n_keep=128 %_change_with_patch=67.69
n_get_bulk=4 n_put_bulk=4 n_keep=32 %_change_with_patch=1.45
n_get_bulk=4 n_put_bulk=4 n_keep=128 %_change_with_patch=8.84
n_get_bulk=4 n_put_bulk=32 n_keep=32 %_change_with_patch=-5.27 (regression)
n_get_bulk=4 n_put_bulk=32 n_keep=128 %_change_with_patch=-3.09 (regression)
n_get_bulk=32 n_put_bulk=1 n_keep=32 %_change_with_patch=76.11
n_get_bulk=32 n_put_bulk=1 n_keep=128 %_change_with_patch=86.06
n_get_bulk=32 n_put_bulk=4 n_keep=32 %_change_with_patch=11.86
n_get_bulk=32 n_put_bulk=4 n_keep=128 %_change_with_patch=16.55
n_get_bulk=32 n_put_bulk=32 n_keep=32 %_change_with_patch=13.01
n_get_bulk=32 n_put_bulk=32 n_keep=128 %_change_with_patch=11.51


From analyzing the results, it is clear that for n_get_bulk and
n_put_bulk sizes of 32 there is no performance regression.
IMO, the other sizes are not practical from a performance perspective,
and the regression in those cases can be safely ignored.

An attempt to increase the size of the mempool to 32GB, by dividing the
index by sizeof(uintptr_t), has led to a performance degradation of
~5% compared to the base performance.
---
v2:
 - Increase size of mempool to 32GB (Morten)
 - Improve performance for other platforms using dual loop unrolling
---
Dharmik Thakkar (1):
  mempool: implement index-based per core cache

 lib/mempool/rte_mempool.h             | 150 +++++++++++++++++++++++++-
 lib/mempool/rte_mempool_ops_default.c |   7 ++
 2 files changed, 156 insertions(+), 1 deletion(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-13  5:36   ` [PATCH v2 " Dharmik Thakkar
@ 2022-01-13  5:36     ` Dharmik Thakkar
  2022-01-13 10:18       ` Jerin Jacob
                         ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Dharmik Thakkar @ 2022-01-13  5:36 UTC (permalink / raw)
  To: Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang, Dharmik Thakkar

Current mempool per core cache implementation stores pointers to mbufs
On 64b architectures, each pointer consumes 8B
This patch replaces it with index-based implementation,
where in each buffer is addressed by (pool base address + index)
It reduces the amount of memory/cache required for per core cache

L3Fwd performance testing reveals minor improvements in the cache
performance (L1 and L2 misses reduced by 0.60%)
with no change in throughput

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/mempool/rte_mempool.h             | 150 +++++++++++++++++++++++++-
 lib/mempool/rte_mempool_ops_default.c |   7 ++
 2 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1e7a3c15273c..f2403fbc97a7 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -50,6 +50,10 @@
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#include <rte_vect.h>
+#endif
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -239,6 +243,9 @@ struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *pool_base_value; /**< Base value to calculate indices */
+#endif
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1314,7 +1321,22 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = mp->pool_base_value;
+	uint32_t *cache_objs = (uint32_t *) cache->objs;
+	for (i = 0; i < cache_len; i++) {
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		cache_objs[i] = cache_objs[i] << 3;
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	}
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
+#else
 	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+#endif
 	cache->len = 0;
 }
 
@@ -1334,7 +1356,14 @@ static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
+	uint32_t temp_objs[2];
+#else
 	void **cache_objs;
+#endif
 
 	/* increment stat now, adding in mempool always success */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
@@ -1344,7 +1373,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
 		goto ring_enqueue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	cache_objs = (uint32_t *) cache->objs;
+	cache_objs = &cache_objs[cache->len];
+	base_value = mp->pool_base_value;
+#else
 	cache_objs = &cache->objs[cache->len];
+#endif
 
 	/*
 	 * The cache follows the following algorithm
@@ -1354,13 +1389,50 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
+	for (i = 0; i < (n & ~0x1); i += 2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		v_cache_objs = vshr_n_u32(v_cache_objs, 3);
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+#else
+	for (i = 0; i < (n & ~0x1); i += 2) {
+		temp_objs[0] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+		temp_objs[1] = (uint32_t) RTE_PTR_DIFF(obj_table[i + 1], base_value);
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		cache_objs[i] = temp_objs[0] >> 3;
+		cache_objs[i + 1] = temp_objs[1] >> 3;
+	}
+#endif
+	if (n & 0x1) {
+		temp_objs[0] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+
+		/* Divide by sizeof(uintptr_t) to accommodate 16G/32G mempool */
+		cache_objs[i] = temp_objs[0] >> 3;
+	}
+#else
 	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
+				cache->len - cache->size);
+#else
 		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
 				cache->len - cache->size);
+#endif
 		cache->len = cache->size;
 	}
 
@@ -1461,13 +1533,23 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 {
 	int ret;
 	uint32_t index, len;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t i;
+	uint32_t *cache_objs;
+	uint32_t objs[2];
+#else
 	void **cache_objs;
-
+#endif
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *base_value = mp->pool_base_value;
+	cache_objs = (uint32_t *) cache->objs;
+#else
 	cache_objs = cache->objs;
+#endif
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
@@ -1475,8 +1557,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		uint32_t req = n + (cache->size - cache->len);
 
 		/* How many do we require i.e. number to fill the cache + the request */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+		ret = rte_mempool_ops_dequeue_bulk(mp,
+			temp_objs, req);
+#else
 		ret = rte_mempool_ops_dequeue_bulk(mp,
 			&cache->objs[cache->len], req);
+#endif
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1487,12 +1575,72 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
+								base_value);
+			/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+			cache_objs[len] = cache_objs[len] >> 3;
+		}
+#endif
+
 		cache->len += req;
 	}
 
 	/* Now fill in the response ... */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
+						len -= 4, obj_table += 4) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		v_cache_objs = vshlq_n_u64(v_cache_objs, 3);
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		v_cache_objs = vshlq_n_u64(v_cache_objs, 3);
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
+	}
+	switch (n & 0x3) {
+	case 3:
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		objs[0] = cache_objs[len--] << 3;
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, objs[0]); /* fallthrough */
+	case 2:
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		objs[0] = cache_objs[len--] << 3;
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, objs[0]); /* fallthrough */
+	case 1:
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		objs[0] = cache_objs[len] << 3;
+		*(obj_table) = (void *) RTE_PTR_ADD(base_value, objs[0]);
+	}
+#else
+	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index += 2,
+						len -= 2, obj_table += 2) {
+		/* Scale by sizeof(uintptr_t) to accommodate 16GB/32GB mempool */
+		objs[0] = cache_objs[len] << 3;
+		objs[1] = cache_objs[len - 1] << 3;
+		*obj_table = (void *) RTE_PTR_ADD(base_value, objs[0]);
+		*(obj_table + 1) = (void *) RTE_PTR_ADD(base_value, objs[1]);
+	}
+
+	if (n & 0x1) {
+		objs[0] = cache_objs[len] << 3;
+		*obj_table = (void *) RTE_PTR_ADD(base_value, objs[0]);
+	}
+#endif
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
 		*obj_table = cache_objs[len];
+#endif
 
 	cache->len -= n;
 
diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
index 22fccf9d7619..3543cad9d4ce 100644
--- a/lib/mempool/rte_mempool_ops_default.c
+++ b/lib/mempool/rte_mempool_ops_default.c
@@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
 		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		/* Store pool base value to calculate indices for index-based
+		 * lcore cache implementation
+		 */
+		if (i == 0)
+			mp->pool_base_value = obj;
+#endif
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
 		off += mp->elt_size + mp->trailer_size;
 	}
-- 
2.17.1


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-13  5:36     ` [PATCH v2 1/1] " Dharmik Thakkar
@ 2022-01-13 10:18       ` Jerin Jacob
  2022-01-20  8:21       ` Morten Brørup
  2022-01-23  7:13       ` Wang, Haiyue
  2 siblings, 0 replies; 52+ messages in thread
From: Jerin Jacob @ 2022-01-13 10:18 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: Olivier Matz, Andrew Rybchenko, dpdk-dev, nd,
	Honnappa Nagarahalli, Ruifeng Wang (Arm Technology China)

On Thu, Jan 13, 2022 at 11:06 AM Dharmik Thakkar
<dharmik.thakkar@arm.com> wrote:
>
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
>
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput
>
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---

>
>         /* Now fill in the response ... */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE

Instead of having this #ifdef clutter everywhere for each pair,
I think we can test RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE once
and have two different implementations, i.e.:
#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
void x()
{

}
void y()
{

}
#else

void x()
{

}
void y()
{

}

#endif

call
x();
y();

in the main code.
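
For example (an illustrative sketch only; the helper name is made up, it assumes the pool_base_value field from the patch, and the v2 scaling by sizeof(uintptr_t) is omitted for brevity), the put path could be split as:

/* Illustrative sketch of the suggested refactoring; not part of the patch. */
#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
static __rte_always_inline void
mempool_cache_store(struct rte_mempool *mp, struct rte_mempool_cache *cache,
		    void * const *obj_table, unsigned int n)
{
	uint32_t *cache_objs = (uint32_t *)cache->objs;
	void *base = mp->pool_base_value;
	unsigned int i;

	for (i = 0; i < n; i++)
		cache_objs[cache->len + i] =
			(uint32_t)RTE_PTR_DIFF(obj_table[i], base);
}
#else
static __rte_always_inline void
mempool_cache_store(struct rte_mempool *mp __rte_unused,
		    struct rte_mempool_cache *cache,
		    void * const *obj_table, unsigned int n)
{
	rte_memcpy(&cache->objs[cache->len], obj_table, sizeof(void *) * n);
}
#endif

/* rte_mempool_do_generic_put() then just calls mempool_cache_store()
 * with no #ifdef at the call site. */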

> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
> index 22fccf9d7619..3543cad9d4ce 100644
> --- a/lib/mempool/rte_mempool_ops_default.c
> +++ b/lib/mempool/rte_mempool_ops_default.c
> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
>                 obj = va + off;
>                 obj_cb(mp, obj_cb_arg, obj,
>                        (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE

This is the only place it is used in C code.
Since we are going with a compile-time approach, can we make this unconditional?
That would enable the use of this model in the application without
recompiling DPDK. All the application needs to do is:

#define RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE 1
#include <rte_mempool.h>

I believe structuring it this way helps avoid recompiling DPDK.


> +               /* Store pool base value to calculate indices for index-based
> +                * lcore cache implementation
> +                */
> +               if (i == 0)
> +                       mp->pool_base_value = obj;
> +#endif
>                 rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>                 off += mp->elt_size + mp->trailer_size;
>         }
> --
> 2.17.1
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 1/1] mempool: implement index-based per core cache
  2022-01-13  5:17       ` Dharmik Thakkar
@ 2022-01-13 10:37         ` Ananyev, Konstantin
  2022-01-19 15:32           ` Dharmik Thakkar
  0 siblings, 1 reply; 52+ messages in thread
From: Ananyev, Konstantin @ 2022-01-13 10:37 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Honnappa Nagarahalli,
	Ruifeng Wang


Hi Dharmik,

> >
> >> Current mempool per core cache implementation stores pointers to mbufs
> >> On 64b architectures, each pointer consumes 8B
> >> This patch replaces it with index-based implementation,
> >> where in each buffer is addressed by (pool base address + index)
> >> It reduces the amount of memory/cache required for per core cache
> >>
> >> L3Fwd performance testing reveals minor improvements in the cache
> >> performance (L1 and L2 misses reduced by 0.60%)
> >> with no change in throughput
> >
> > I feel really sceptical about that patch and the whole idea in general:
> > - From what I read above there is no real performance improvement observed.
> >  (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> >  see below for more details).
> 
> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> and in my understanding should give better performance for a bulk size of 32.

Might be, but I still don't see the reason for such an effort.
As you mentioned, there is no performance improvement in 'real' apps (l3fwd, etc.)
on ARM64, even with the vectorized version of the code.

> > - Space utilization difference looks neglectable too.
> 
> Sorry, I did not understand this point.

As I understand it, one of the expectations from that patch was to
reduce the memory/cache required, which should improve cache utilization
(fewer misses, etc.).
Though I think such improvements would be negligible and wouldn't
cause any real performance gain.

> > - The change introduces a new build time config option with a major limitation:
> >   All memzones in a pool have to be within the same 4GB boundary.
> >   To address it properly, extra changes will be required in init(/populate) part of the code.
> 
> I agree to the above mentioned challenges and I am currently working on resolving these issues.

I still think that to justify such changes some really noticeable performance
improvement needs to be demonstrated: a double-digit speedup for l3fwd/ipsec-secgw/...
Otherwise it is just not worth the hassle.
 
> >   All that will complicate mempool code, will make it more error prone
> >   and harder to maintain.
> > But, as there is no real gain in return - no point to add such extra complexity at all.
> >
> > Konstantin
> >

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 1/1] mempool: implement index-based per core cache
  2022-01-13 10:37         ` Ananyev, Konstantin
@ 2022-01-19 15:32           ` Dharmik Thakkar
  2022-01-21 11:25             ` Ananyev, Konstantin
  0 siblings, 1 reply; 52+ messages in thread
From: Dharmik Thakkar @ 2022-01-19 15:32 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Honnappa Nagarahalli,
	Ruifeng Wang

Hi Konstantin,

> On Jan 13, 2022, at 4:37 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> 
> Hi Dharmik,
> 
>>> 
>>>> Current mempool per core cache implementation stores pointers to mbufs
>>>> On 64b architectures, each pointer consumes 8B
>>>> This patch replaces it with index-based implementation,
>>>> where in each buffer is addressed by (pool base address + index)
>>>> It reduces the amount of memory/cache required for per core cache
>>>> 
>>>> L3Fwd performance testing reveals minor improvements in the cache
>>>> performance (L1 and L2 misses reduced by 0.60%)
>>>> with no change in throughput
>>> 
>>> I feel really sceptical about that patch and the whole idea in general:
>>> - From what I read above there is no real performance improvement observed.
>>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>>> see below for more details).
>> 
>> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
>> Similar optimizations can be implemented for x86 platforms which should close the performance gap
>> and in my understanding should give better performance for a bulk size of 32.
> 
> Might be, but I still don't see the reason for such effort.
> As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> on ARM64 even with vectorized version of the code.
> 

IMO, even without a performance improvement, it is advantageous because, with the patch, the same performance is achieved
with less memory and lower cache utilization.

>>> - Space utilization difference looks neglectable too.
>> 
>> Sorry, I did not understand this point.
> 
> As I understand one of the expectations from that patch was:
> reduce memory/cache required, which should improve cache utilization
> (less misses, etc.).
> Though I think such improvements would be neglectable and wouldn't
> cause any real performance gain.

The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
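
For reference, a back-of-envelope calculation (assuming the default RTE_MEMPOOL_CACHE_MAX_SIZE of 512):

  pointer-based objs[]: 512 * 3 * sizeof(void *)   = 12288 B per lcore cache
  index-based objs[]:   512 * 3 * sizeof(uint32_t) =  6144 B per lcore cache

So the same 12KB footprint could hold twice as many cached objects.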

> 
>>> - The change introduces a new build time config option with a major limitation:
>>>  All memzones in a pool have to be within the same 4GB boundary.
>>>  To address it properly, extra changes will be required in init(/populate) part of the code.
>> 
>> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> 
> I still think that to justify such changes some really noticeable performance
> improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...  
> Otherwise it just not worth the hassle. 
> 

As I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
That could be the reason the l3fwd numbers don't fully show the advantage of the patch.
I'm seeing double-digit improvements with mempool_perf_autotest, which should not be ignored.

>>>  All that will complicate mempool code, will make it more error prone
>>>  and harder to maintain.
>>> But, as there is no real gain in return - no point to add such extra complexity at all.
>>> 
>>> Konstantin
>>> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-13  5:36     ` [PATCH v2 1/1] " Dharmik Thakkar
  2022-01-13 10:18       ` Jerin Jacob
@ 2022-01-20  8:21       ` Morten Brørup
  2022-01-21  6:01         ` Honnappa Nagarahalli
  2022-01-23  7:13       ` Wang, Haiyue
  2 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2022-01-20  8:21 UTC (permalink / raw)
  To: Dharmik Thakkar, honnappa.nagarahalli, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, ruifeng.wang, Beilei Xing

+CC Beilei as i40e maintainer

> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
> Sent: Thursday, 13 January 2022 06.37
> 
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput
> 
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/mempool/rte_mempool.h             | 150 +++++++++++++++++++++++++-
>  lib/mempool/rte_mempool_ops_default.c |   7 ++
>  2 files changed, 156 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1e7a3c15273c..f2403fbc97a7 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -50,6 +50,10 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
> 
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#include <rte_vect.h>
> +#endif
> +
>  #include "rte_mempool_trace_fp.h"
> 
>  #ifdef __cplusplus
> @@ -239,6 +243,9 @@ struct rte_mempool {
>  	int32_t ops_index;
> 
>  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache
> */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	void *pool_base_value; /**< Base value to calculate indices */
> +#endif
> 
>  	uint32_t populated_size;         /**< Number of populated
> objects. */
>  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in
> pool */
> @@ -1314,7 +1321,22 @@ rte_mempool_cache_flush(struct rte_mempool_cache
> *cache,
>  	if (cache == NULL || cache->len == 0)
>  		return;
>  	rte_mempool_trace_cache_flush(cache, mp);
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +	unsigned int i;
> +	unsigned int cache_len = cache->len;
> +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +	void *base_value = mp->pool_base_value;
> +	uint32_t *cache_objs = (uint32_t *) cache->objs;

Hi Dharmik and Honnappa,

The essence of this patch is based on recasting the type of the objs field in the rte_mempool_cache structure from an array of pointers to an array of uint32_t.

However, this effectively breaks the ABI, because the rte_mempool_cache structure is public and part of the API.

Some drivers [1] even bypass the mempool API and access the rte_mempool_cache structure directly, assuming that the objs array in the cache is an array of pointers. So you cannot recast the fields in the rte_mempool_cache structure the way this patch requires.
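
To make the concern concrete, the bypass pattern in such drivers is roughly of the following shape (illustrative only; this is not the actual i40e code, and the function name is made up):

#include <string.h>
#include <rte_lcore.h>
#include <rte_mempool.h>

/* A PMD-style fast free path that writes raw pointers straight into the
 * per-lcore cache, bypassing the mempool API. */
static inline void
pmd_style_direct_cache_put(struct rte_mempool *mp, void **mbufs, unsigned int n)
{
	struct rte_mempool_cache *cache =
		rte_mempool_default_cache(mp, rte_lcore_id());

	if (cache == NULL || cache->len + n > RTE_MEMPOOL_CACHE_MAX_SIZE * 3) {
		/* Fall back to the normal API. */
		rte_mempool_generic_put(mp, mbufs, n, cache);
		return;
	}

	/* Assumes cache->objs is an array of pointers; with an index-based
	 * cache this would silently corrupt the pool. */
	memcpy(&cache->objs[cache->len], mbufs, n * sizeof(void *));
	cache->len += n;
}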

Although I do consider bypassing an API's accessor functions "spaghetti code", this driver's behavior is formally acceptable as long as the rte_mempool_cache structure is not marked as internal.

I really liked your idea of using indexes instead of pointers, so I'm very sorry to shoot it down. :-(

[1]: E.g. the Intel i40e PMD, http://code.dpdk.org/dpdk/latest/source/drivers/net/i40e/i40e_rxtx_vec_avx512.c#L25

-Morten


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-20  8:21       ` Morten Brørup
@ 2022-01-21  6:01         ` Honnappa Nagarahalli
  2022-01-21  7:36           ` Morten Brørup
  2022-01-21  9:12           ` Bruce Richardson
  0 siblings, 2 replies; 52+ messages in thread
From: Honnappa Nagarahalli @ 2022-01-21  6:01 UTC (permalink / raw)
  To: Morten Brørup, Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, Ruifeng Wang, Beilei Xing, nd


> 
> +CC Beilei as i40e maintainer
> 
> > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
> > Sent: Thursday, 13 January 2022 06.37
> >
> > Current mempool per core cache implementation stores pointers to mbufs
> > On 64b architectures, each pointer consumes 8B This patch replaces it
> > with index-based implementation, where in each buffer is addressed by
> > (pool base address + index) It reduces the amount of memory/cache
> > required for per core cache
> >
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance (L1 and L2 misses reduced by 0.60%) with no change in
> > throughput
> >
> > Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > ---
> >  lib/mempool/rte_mempool.h             | 150 +++++++++++++++++++++++++-
> >  lib/mempool/rte_mempool_ops_default.c |   7 ++
> >  2 files changed, 156 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 1e7a3c15273c..f2403fbc97a7 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -50,6 +50,10 @@
> >  #include <rte_memcpy.h>
> >  #include <rte_common.h>
> >
> > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> > +#include <rte_vect.h>
> > +#endif
> > +
> >  #include "rte_mempool_trace_fp.h"
> >
> >  #ifdef __cplusplus
> > @@ -239,6 +243,9 @@ struct rte_mempool {
> >  	int32_t ops_index;
> >
> >  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> > +	void *pool_base_value; /**< Base value to calculate indices */
> > +#endif
> >
> >  	uint32_t populated_size;         /**< Number of populated
> > objects. */
> >  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in
> > pool */ @@ -1314,7 +1321,22 @@ rte_mempool_cache_flush(struct
> > rte_mempool_cache *cache,
> >  	if (cache == NULL || cache->len == 0)
> >  		return;
> >  	rte_mempool_trace_cache_flush(cache, mp);
> > +
> > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> > +	unsigned int i;
> > +	unsigned int cache_len = cache->len;
> > +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > +	void *base_value = mp->pool_base_value;
> > +	uint32_t *cache_objs = (uint32_t *) cache->objs;
> 
> Hi Dharmik and Honnappa,
> 
> The essence of this patch is based on recasting the type of the objs field in the
> rte_mempool_cache structure from an array of pointers to an array of
> uint32_t.
> 
> However, this effectively breaks the ABI, because the rte_mempool_cache
> structure is public and part of the API.
The patch does not change the public structure; the new member is under a compile-time flag, so I am not sure how it breaks the ABI.

> 
> Some drivers [1] even bypass the mempool API and access the
> rte_mempool_cache structure directly, assuming that the objs array in the
> cache is an array of pointers. So you cannot recast the fields in the
> rte_mempool_cache structure the way this patch requires.
IMO, those drivers are at fault. The mempool cache structure is public only because the APIs are inline. We should still maintain modularity and not directly use the members of structures belonging to another library. A similar effort involving rte_ring was not accepted some time back [1].

[1] http://inbox.dpdk.org/dev/DBAPR08MB5814907968595EE56F5E20A798390@DBAPR08MB5814.eurprd08.prod.outlook.com/

> 
> Although I do consider bypassing an API's accessor functions "spaghetti
> code", this driver's behavior is formally acceptable as long as the
> rte_mempool_cache structure is not marked as internal.
> 
> I really liked your idea of using indexes instead of pointers, so I'm very sorry to
> shoot it down. :-(
> 
> [1]: E.g. the Intel i40e PMD,
> http://code.dpdk.org/dpdk/latest/source/drivers/net/i40e/i40e_rxtx_vec_avx
> 512.c#L25
It is possible to throw an error when this feature is enabled in this file. Alternatively, this PMD could implement the code for the index-based mempool.
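
For example, something as simple as a guard in the PMD (illustrative only, not an actual patch):

/* Illustrative build-time guard for a PMD that accesses the cache directly. */
#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
#error "This vector path accesses rte_mempool_cache directly and does not support index-based caches"
#endif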

> 
> -Morten


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-21  6:01         ` Honnappa Nagarahalli
@ 2022-01-21  7:36           ` Morten Brørup
  2022-01-24 13:05             ` Ray Kinsella
  2022-01-21  9:12           ` Bruce Richardson
  1 sibling, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2022-01-21  7:36 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Dharmik Thakkar, Olivier Matz,
	Andrew Rybchenko, Ray Kinsella
  Cc: dev, nd, Ruifeng Wang, Beilei Xing, nd

+Ray Kinsella, ABI Policy maintainer

> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Friday, 21 January 2022 07.01
> 
> >
> > +CC Beilei as i40e maintainer
> >
> > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
> > > Sent: Thursday, 13 January 2022 06.37
> > >
> > > Current mempool per core cache implementation stores pointers to
> mbufs
> > > On 64b architectures, each pointer consumes 8B This patch replaces
> it
> > > with index-based implementation, where in each buffer is addressed
> by
> > > (pool base address + index) It reduces the amount of memory/cache
> > > required for per core cache
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance (L1 and L2 misses reduced by 0.60%) with no change in
> > > throughput
> > >
> > > Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > ---
> > >  lib/mempool/rte_mempool.h             | 150
> +++++++++++++++++++++++++-
> > >  lib/mempool/rte_mempool_ops_default.c |   7 ++
> > >  2 files changed, 156 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 1e7a3c15273c..f2403fbc97a7 100644
> > > --- a/lib/mempool/rte_mempool.h
> > > +++ b/lib/mempool/rte_mempool.h
> > > @@ -50,6 +50,10 @@
> > >  #include <rte_memcpy.h>
> > >  #include <rte_common.h>
> > >
> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> > > +#include <rte_vect.h>
> > > +#endif
> > > +
> > >  #include "rte_mempool_trace_fp.h"
> > >
> > >  #ifdef __cplusplus
> > > @@ -239,6 +243,9 @@ struct rte_mempool {
> > >  	int32_t ops_index;
> > >
> > >  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache
> */
> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> > > +	void *pool_base_value; /**< Base value to calculate indices */
> > > +#endif
> > >
> > >  	uint32_t populated_size;         /**< Number of populated
> > > objects. */
> > >  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in
> > > pool */ @@ -1314,7 +1321,22 @@ rte_mempool_cache_flush(struct
> > > rte_mempool_cache *cache,
> > >  	if (cache == NULL || cache->len == 0)
> > >  		return;
> > >  	rte_mempool_trace_cache_flush(cache, mp);
> > > +
> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> > > +	unsigned int i;
> > > +	unsigned int cache_len = cache->len;
> > > +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > +	void *base_value = mp->pool_base_value;
> > > +	uint32_t *cache_objs = (uint32_t *) cache->objs;
> >
> > Hi Dharmik and Honnappa,
> >
> > The essence of this patch is based on recasting the type of the objs
> field in the
> > rte_mempool_cache structure from an array of pointers to an array of
> > uint32_t.
> >
> > However, this effectively breaks the ABI, because the
> rte_mempool_cache
> > structure is public and part of the API.
> The patch does not change the public structure, the new member is under
> compile time flag, not sure how it breaks the ABI.
> 
> >
> > Some drivers [1] even bypass the mempool API and access the
> > rte_mempool_cache structure directly, assuming that the objs array in
> the
> > cache is an array of pointers. So you cannot recast the fields in the
> > rte_mempool_cache structure the way this patch requires.
> IMO, those drivers are at fault. The mempool cache structure is public
> only because the APIs are inline. We should still maintain modularity
> and not use the members of structures belonging to another library
> directly. A similar effort involving rte_ring was not accepted sometime
> back [1]
> 
> [1]
> http://inbox.dpdk.org/dev/DBAPR08MB5814907968595EE56F5E20A798390@DBAPR0
> 8MB5814.eurprd08.prod.outlook.com/
> 
> >
> > Although I do consider bypassing an API's accessor functions
> "spaghetti
> > code", this driver's behavior is formally acceptable as long as the
> > rte_mempool_cache structure is not marked as internal.
> >
> > I really liked your idea of using indexes instead of pointers, so I'm
> very sorry to
> > shoot it down. :-(
> >
> > [1]: E.g. the Intel i40e PMD,
> >
> http://code.dpdk.org/dpdk/latest/source/drivers/net/i40e/i40e_rxtx_vec_
> avx
> > 512.c#L25
> It is possible to throw an error when this feature is enabled in this
> file. Alternatively, this PMD could implement the code for index based
> mempool.
> 

I agree with both your points, Honnappa.

The ABI remains intact, and only changes when this feature is enabled at compile time.

In addition to your suggestions, I propose that the patch modifies the objs type in the mempool cache structure itself, instead of type casting it through an access variable. This should throw an error when compiling an application that accesses it as a pointer array instead of a uint32_t array - like the affected Intel PMDs.

The updated objs field in the mempool cache structure should have the same size when compiled as the original objs field, so this feature doesn't change anything else in the ABI, only the type of the mempool cache objects.

Also, the description of the feature should stress that applications accessing the cache objects directly will fail miserably.
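
A sketch of that suggestion (fields abbreviated; this is not an actual patch) could look like:

/* Sketch only: change the field type under the build flag so that code
 * treating objs as an array of pointers no longer compiles cleanly. */
struct rte_mempool_cache {
	uint32_t size;
	uint32_t flushthresh;
	uint32_t len;
	/* ... other fields unchanged ... */
#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
	/* Same byte footprint as the pointer array on 64-bit:
	 * twice the entries at half the width. */
	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3 * 2];
#else
	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
#endif
} __rte_cache_aligned;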


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-21  6:01         ` Honnappa Nagarahalli
  2022-01-21  7:36           ` Morten Brørup
@ 2022-01-21  9:12           ` Bruce Richardson
  1 sibling, 0 replies; 52+ messages in thread
From: Bruce Richardson @ 2022-01-21  9:12 UTC (permalink / raw)
  To: Honnappa Nagarahalli
  Cc: Morten Brørup, Dharmik Thakkar, Olivier Matz,
	Andrew Rybchenko, dev, nd, Ruifeng Wang, Beilei Xing

On Fri, Jan 21, 2022 at 06:01:23AM +0000, Honnappa Nagarahalli wrote:
> 
> > 
> > +CC Beilei as i40e maintainer
> > 
> > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> > > Thursday, 13 January 2022 06.37
> > >
> > > Current mempool per core cache implementation stores pointers to
> > > mbufs On 64b architectures, each pointer consumes 8B This patch
> > > replaces it with index-based implementation, where in each buffer is
> > > addressed by (pool base address + index) It reduces the amount of
> > > memory/cache required for per core cache
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance (L1 and L2 misses reduced by 0.60%) with no change in
> > > throughput
> > >
> > > Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com> Reviewed-by:
> > > Ruifeng Wang <ruifeng.wang@arm.com> --- lib/mempool/rte_mempool.h
> > > | 150 +++++++++++++++++++++++++-
> > > lib/mempool/rte_mempool_ops_default.c |   7 ++ 2 files changed, 156
> > > insertions(+), 1 deletion(-)
> > >
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 1e7a3c15273c..f2403fbc97a7 100644 ---
> > > a/lib/mempool/rte_mempool.h +++ b/lib/mempool/rte_mempool.h @@ -50,6
> > > +50,10 @@ #include <rte_memcpy.h> #include <rte_common.h>
> > >
> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE +#include <rte_vect.h>
> > > +#endif + #include "rte_mempool_trace_fp.h"
> > >
> > >  #ifdef __cplusplus @@ -239,6 +243,9 @@ struct rte_mempool { int32_t
> > >  ops_index;
> > >
> > >  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache
> > >  	*/ +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE +	void
> > >  	*pool_base_value; /**< Base value to calculate indices */ +#endif
> > >
> > >  	uint32_t populated_size;         /**< Number of populated objects.
> > >  	*/ struct rte_mempool_objhdr_list elt_list; /**< List of objects in
> > >  	pool */ @@ -1314,7 +1321,22 @@ rte_mempool_cache_flush(struct
> > >  	rte_mempool_cache *cache, if (cache == NULL || cache->len == 0)
> > >  	return; rte_mempool_trace_cache_flush(cache, mp); + +#ifdef
> > >  	RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE +	unsigned int i; +
> > >  	unsigned int cache_len = cache->len; +	void
> > >  	*obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; +	void *base_value =
> > >  	mp->pool_base_value; +	uint32_t *cache_objs = (uint32_t *)
> > >  	cache->objs;
> > 
> > Hi Dharmik and Honnappa,
> > 
> > The essence of this patch is based on recasting the type of the objs
> > field in the rte_mempool_cache structure from an array of pointers to
> > an array of uint32_t.
> > 
> > However, this effectively breaks the ABI, because the rte_mempool_cache
> > structure is public and part of the API.
> The patch does not change the public structure, the new member is under
> compile time flag, not sure how it breaks the ABI.
> 
> > 
> > Some drivers [1] even bypass the mempool API and access the
> > rte_mempool_cache structure directly, assuming that the objs array in
> > the cache is an array of pointers. So you cannot recast the fields in
> > the rte_mempool_cache structure the way this patch requires.
> IMO, those drivers are at fault. The mempool cache structure is public
> only because the APIs are inline. We should still maintain modularity and
> not use the members of structures belonging to another library directly.
> A similar effort involving rte_ring was not accepted sometime back [1]
> 
> [1]
> http://inbox.dpdk.org/dev/DBAPR08MB5814907968595EE56F5E20A798390@DBAPR08MB5814.eurprd08.prod.outlook.com/
> 
> > 
> > Although I do consider bypassing an API's accessor functions "spaghetti
> > code", this driver's behavior is formally acceptable as long as the
> > rte_mempool_cache structure is not marked as internal.
> > 
> > I really liked your idea of using indexes instead of pointers, so I'm
> > very sorry to shoot it down. :-(
> > 
> > [1]: E.g. the Intel i40e PMD,
> > http://code.dpdk.org/dpdk/latest/source/drivers/net/i40e/i40e_rxtx_vec_avx
> > 512.c#L25
> It is possible to throw an error when this feature is enabled in this
> file. Alternatively, this PMD could implement the code for index based
> mempool.
>
Yes, it can implement it, and if this model gets put into mempool it probably
will [even if it's just a fallback to the generic mempool code in that case].

However, I would object to adding this model to the library right now if it
cannot be shown to provide some benefit in a real-world case. As I understand
it, the only benefit seen so far has been in unit test cases? I want to ensure
that any perf improvements we put in have some real-world
applicability - the amount of applicability will depend on the scope and
impact - and by the same token that we don't reject simplifications or
improvements on the basis that they *might* cause issues, if all perf data
fails to show any problem.

So for this patch, can we get some perf numbers for an app where it does
show its value? L3fwd is a very trivial app, and as such is usually
fairly reliable in showing the perf benefits of optimizations if they exist.
Perhaps for this case we need something with a bigger cache footprint?

Regards,
/Bruce

^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 1/1] mempool: implement index-based per core cache
  2022-01-19 15:32           ` Dharmik Thakkar
@ 2022-01-21 11:25             ` Ananyev, Konstantin
  2022-01-21 11:31               ` Ananyev, Konstantin
  2022-03-24 19:51               ` Dharmik Thakkar
  0 siblings, 2 replies; 52+ messages in thread
From: Ananyev, Konstantin @ 2022-01-21 11:25 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Honnappa Nagarahalli,
	Ruifeng Wang



Hi Dharmik,
> >
> >>>
> >>>> Current mempool per core cache implementation stores pointers to mbufs
> >>>> On 64b architectures, each pointer consumes 8B
> >>>> This patch replaces it with index-based implementation,
> >>>> where in each buffer is addressed by (pool base address + index)
> >>>> It reduces the amount of memory/cache required for per core cache
> >>>>
> >>>> L3Fwd performance testing reveals minor improvements in the cache
> >>>> performance (L1 and L2 misses reduced by 0.60%)
> >>>> with no change in throughput
> >>>
> >>> I feel really sceptical about that patch and the whole idea in general:
> >>> - From what I read above there is no real performance improvement observed.
> >>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> >>> see below for more details).
> >>
> >> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> >> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> >> and in my understanding should give better performance for a bulk size of 32.
> >
> > Might be, but I still don't see the reason for such effort.
> > As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> > on ARM64 even with vectorized version of the code.
> >
> 
> IMO, even without performance improvement, it is advantageous because the same performance is being achieved
> with less memory and cache utilization using the patch.
> 
> >>> - Space utilization difference looks neglectable too.
> >>
> >> Sorry, I did not understand this point.
> >
> > As I understand one of the expectations from that patch was:
> > reduce memory/cache required, which should improve cache utilization
> > (less misses, etc.).
> > Though I think such improvements would be neglectable and wouldn't
> > cause any real performance gain.
> 
> The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
> Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.

It saves you just 4 bytes per mbuf.
Even for a simple l2fwd-like workload we access ~100 bytes per mbuf.
Let's do a simplistic estimation of the number of affected cache-lines for l2fwd.
For a bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:

                                         number of cache-lines accessed
                                         cache with pointers / cache with indexes
mempool_get:                             (32*8)/64=4    /   (32*4)/64=2
RX (read HW desc):                       (32*16)/64=8   /   (32*16)/64=8
RX (write mbuf fields, 1st cache line):  (32*64)/64=3   /   (32*64)/64=32
update mac addrs:                        (32*64)/64=32  /   (32*64)/64=32
TX (write HW desc):                      (32*16)/64=8   /   (32*16)/64=8
free mbufs (read 2nd mbuf cache line):   (32*64)/64=32  /   (32*64)/64=32
mempool_put:                             (32*8)/64=4    /   (32*4)/64=2
total:                                   120                116

So, if my calculations are correct, the max estimated gain in cache utilization would be:
(120-116)*100/120=3.33%
Note that these numbers are for an over-simplistic usage scenario.
In more realistic ones, where we have to touch more cache-lines per packet,
the difference would be even less noticeable.
So I really doubt we will see any noticeable improvement in cache utilization
with that patch.
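
For anyone who wants to reproduce the arithmetic, here is a stand-alone back-of-envelope check using the same assumptions as above (32-packet bursts, 64B cache-lines, 16B HW descriptors, 8B pointers vs 4B indices, three full mbuf cache-lines touched per packet); it prints 120 vs 116 lines, i.e. a 3.33% difference:

#include <stdio.h>

/* Number of cache lines touched by a burst, rounding up. */
static unsigned int
lines(unsigned int nb_pkts, unsigned int bytes_per_pkt)
{
	return (nb_pkts * bytes_per_pkt + 63) / 64;
}

int
main(void)
{
	unsigned int with_ptrs = 2 * lines(32, 8)   /* mempool get + put */
			       + 2 * lines(32, 16)  /* RX + TX descriptors */
			       + 3 * lines(32, 64); /* mbuf cache lines */
	unsigned int with_idx  = 2 * lines(32, 4)
			       + 2 * lines(32, 16)
			       + 3 * lines(32, 64);

	printf("pointers: %u lines, indexes: %u lines, gain: %.2f%%\n",
	       with_ptrs, with_idx,
	       (with_ptrs - with_idx) * 100.0 / with_ptrs);
	return 0;
}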

> >
> >>> - The change introduces a new build time config option with a major limitation:
> >>>  All memzones in a pool have to be within the same 4GB boundary.
> >>>  To address it properly, extra changes will be required in init(/populate) part of the code.
> >>
> >> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> >
> > I still think that to justify such changes some really noticeable performance
> > improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> > Otherwise it just not worth the hassle.
> >
> 
> Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
> That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.

As I said above, I don’t think we'll see any real advantage here.
But feel free to pick-up different app and prove me wrong.
After all we have plenty of sample apps that do provide enough
pressure on the cache: l3fwd-acl, ipsec-secgw.
Or you can even apply these patches from Sean:
https://patches.dpdk.org/project/dpdk/list/?series=20999
to run l3fwd with configurable routes.
That should help you to make it cache-bound.

> I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.

And for others we are seeing double-digit degradation.
So far the whole idea doesn't look promising at all, at least to me.
Konstantin


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 1/1] mempool: implement index-based per core cache
  2022-01-21 11:25             ` Ananyev, Konstantin
@ 2022-01-21 11:31               ` Ananyev, Konstantin
  2022-03-24 19:51               ` Dharmik Thakkar
  1 sibling, 0 replies; 52+ messages in thread
From: Ananyev, Konstantin @ 2022-01-21 11:31 UTC (permalink / raw)
  To: Ananyev, Konstantin, Dharmik Thakkar
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Honnappa Nagarahalli,
	Ruifeng Wang



> 
> 
> Hi Dharmik,
> > >
> > >>>
> > >>>> Current mempool per core cache implementation stores pointers to mbufs
> > >>>> On 64b architectures, each pointer consumes 8B
> > >>>> This patch replaces it with index-based implementation,
> > >>>> where in each buffer is addressed by (pool base address + index)
> > >>>> It reduces the amount of memory/cache required for per core cache
> > >>>>
> > >>>> L3Fwd performance testing reveals minor improvements in the cache
> > >>>> performance (L1 and L2 misses reduced by 0.60%)
> > >>>> with no change in throughput
> > >>>
> > >>> I feel really sceptical about that patch and the whole idea in general:
> > >>> - From what I read above there is no real performance improvement observed.
> > >>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> > >>> see below for more details).
> > >>
> > >> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> > >> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> > >> and in my understanding should give better performance for a bulk size of 32.
> > >
> > > Might be, but I still don't see the reason for such effort.
> > > As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> > > on ARM64 even with vectorized version of the code.
> > >
> >
> > IMO, even without performance improvement, it is advantageous because the same performance is being achieved
> > with less memory and cache utilization using the patch.
> >
> > >>> - Space utilization difference looks neglectable too.
> > >>
> > >> Sorry, I did not understand this point.
> > >
> > > As I understand one of the expectations from that patch was:
> > > reduce memory/cache required, which should improve cache utilization
> > > (less misses, etc.).
> > > Though I think such improvements would be neglectable and wouldn't
> > > cause any real performance gain.
> >
> > The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
> > Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
> 
> It saves you 4 just bytes per mbuf.
> Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
> Let's do a simplistic estimation of  number of affected cache-lines l for l2fwd.
> For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
> 
>                                                                        number of cache-lines accessed
>                                                                    cache with pointers / cache with indexes
> mempool_get:                                            (32*8)/64=4          /  (32*4)/64=2
> RX (read HW desc):                                    (32*16)/64=8       /   (32*16)/64=8
> RX (write mbuf fields, 1st cache line):    (32*64)/64=3       /   (32*64)/64=32

Should be:
RX (write mbuf fields, 1st cache line):    (32*64)/64=32       /   (32*64)/64=32
of course

> update mac addrs:                                     (32*64)/64=32     /   (32*64)/64=32
> TX (write HW desc):                                   (32*16)/64=8       /   (32*16)/64=8
> free mbufs (read 2nd mbuf cache line): (32*64)/64=32    /   (32*64)/64=32
> mempool_put:                                            (32*8)/64=4        /    (32*4)/64=2
> total:                                                             120                             116
> 
> So, if my calculations are correct, max estimated gain for cache utilization would be:
> (120-116)*100/120=3.33%
> Note that numbers are for over-simplistic usage scenario.
> In more realistic ones, when we have to touch more cache-lines per packet,
> that difference would be even less noticeable.
> So I really doubt we will see some noticeable improvements in terms of cache utilization
> with that patch.
> 
> > >
> > >>> - The change introduces a new build time config option with a major limitation:
> > >>>  All memzones in a pool have to be within the same 4GB boundary.
> > >>>  To address it properly, extra changes will be required in init(/populate) part of the code.
> > >>
> > >> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> > >
> > > I still think that to justify such changes some really noticeable performance
> > > improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> > > Otherwise it just not worth the hassle.
> > >
> >
> > Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
> > That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
> 
> As I said above, I don’t think we'll see any real advantage here.
> But feel free to pick-up different app and prove me wrong.
> After all we have plenty of sample apps that do provide enough
> pressure on the cache: l3fwd-acl, ipsec-secgw.
> Or you can even apply these patches from Sean:
> https://patches.dpdk.org/project/dpdk/list/?series=20999
> to run l3fwd with configurable routes.
> That should help you to make it cache-bound.
> 
> > I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
> 
> And for other we are seeing double digit degradation.
> So far the whole idea doesn't look promising at all, at least to me.
> Konstantin


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-13  5:36     ` [PATCH v2 1/1] " Dharmik Thakkar
  2022-01-13 10:18       ` Jerin Jacob
  2022-01-20  8:21       ` Morten Brørup
@ 2022-01-23  7:13       ` Wang, Haiyue
  2 siblings, 0 replies; 52+ messages in thread
From: Wang, Haiyue @ 2022-01-23  7:13 UTC (permalink / raw)
  To: Dharmik Thakkar, Olivier Matz, Andrew Rybchenko
  Cc: dev, nd, honnappa.nagarahalli, ruifeng.wang, Morten Brørup,
	Ananyev, Konstantin, Richardson, Bruce, Xing, Beilei

> -----Original Message-----
> From: Dharmik Thakkar <dharmik.thakkar@arm.com>
> Sent: Thursday, January 13, 2022 13:37
> To: Olivier Matz <olivier.matz@6wind.com>; Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
> Cc: dev@dpdk.org; nd@arm.com; honnappa.nagarahalli@arm.com; ruifeng.wang@arm.com; Dharmik Thakkar
> <dharmik.thakkar@arm.com>
> Subject: [PATCH v2 1/1] mempool: implement index-based per core cache
> 
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput
> 
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/mempool/rte_mempool.h             | 150 +++++++++++++++++++++++++-
>  lib/mempool/rte_mempool_ops_default.c |   7 ++
>  2 files changed, 156 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1e7a3c15273c..f2403fbc97a7 100644


> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
> index 22fccf9d7619..3543cad9d4ce 100644
> --- a/lib/mempool/rte_mempool_ops_default.c
> +++ b/lib/mempool/rte_mempool_ops_default.c
> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
>  		obj = va + off;
>  		obj_cb(mp, obj_cb_arg, obj,
>  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +		/* Store pool base value to calculate indices for index-based
> +		 * lcore cache implementation
> +		 */
> +		if (i == 0)
> +			mp->pool_base_value = obj;

This is wrong, the populate may run many times. ;-)

I tried the patch below to run "rte_pktmbuf_pool_create(mbuf_pool_0, 1048575, 256, 0, 4096, 0)"

This is the resulting debug output (note also that your patch will make DPDK mempools unable to support > 4GB):

2bfffdb40 (from last debug line 'max') - 1b3fff240 (from first line 'base addr') = 10BFFE900

****mempool mbuf_pool_0 (size = 1048575, populated_size = 46952, elt_size = 4224): base addr = 0x1b3fff240, max = 0x0, diff = 18446744066394688960 (max_objs = 1048575)
****mempool mbuf_pool_0 (size = 1048575, populated_size = 297358, elt_size = 4224): base addr = 0x1c0000040, max = 0x0, diff = 18446744066193358784 (max_objs = 1001623)
****mempool mbuf_pool_0 (size = 1048575, populated_size = 547764, elt_size = 4224): base addr = 0x200000040, max = 0x0, diff = 18446744065119616960 (max_objs = 751217)
****mempool mbuf_pool_0 (size = 1048575, populated_size = 798170, elt_size = 4224): base addr = 0x240000040, max = 0x0, diff = 18446744064045875136 (max_objs = 500811)
****mempool mbuf_pool_0 (size = 1048575, populated_size = 1048575, elt_size = 4224): base addr = 0x280000040, max = 0x2bfffdb40, diff = 1073732352 (max_objs = 250405)

diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
index 22fccf9d76..854067cd43 100644
--- a/lib/mempool/rte_mempool_ops_default.c
+++ b/lib/mempool/rte_mempool_ops_default.c
@@ -99,6 +99,7 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
        unsigned int i;
        void *obj;
        int ret;
+       void *pool_base_value = NULL, *pool_max_value = NULL;

        ret = rte_mempool_get_page_size(mp, &pg_sz);
        if (ret < 0)
@@ -128,9 +129,20 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
                obj_cb(mp, obj_cb_arg, obj,
                       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
                rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
+               if (i == 0)
+                       pool_base_value = obj;
+               else if (i == (max_objs - 1))
+                       pool_max_value = obj;
                off += mp->elt_size + mp->trailer_size;
        }

+       printf("****mempool %s (size = %u, populated_size = %u, elt_size = %u): base addr = 0x%llx, max = 0x%llx, diff = %lu (max_objs = %u)\n",
+               mp->name, mp->size, mp->populated_size,
+               mp->elt_size,
+               (unsigned long long) pool_base_value,
+               (unsigned long long) pool_max_value,
+               RTE_PTR_DIFF(pool_max_value, pool_base_value), max_objs);
+
        return i;
 }


> +#endif
>  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>  		off += mp->elt_size + mp->trailer_size;
>  	}
> --
> 2.17.1
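
One possible direction for the multi-populate issue shown above (a sketch only, not the posted patch): track the lowest and highest object addresses across every populate call instead of overwriting the base each time, and fail once the span no longer fits the index range.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch only: 'lo'/'hi' stand in for fields that would have to live in
 * struct rte_mempool and be updated from the populate helper for every
 * object, across all memzones. */
static int
update_pool_span(uintptr_t *lo, uintptr_t *hi, void *obj, size_t elt_size)
{
	uintptr_t addr = (uintptr_t)obj;

	if (*lo == 0 || addr < *lo)
		*lo = addr;
	if (addr + elt_size > *hi)
		*hi = addr + elt_size;

	if (*hi - *lo > UINT32_MAX)
		return -EOVERFLOW; /* pool spans more than a 4B index covers */
	return 0;
}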


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v2 1/1] mempool: implement index-based per core cache
  2022-01-21  7:36           ` Morten Brørup
@ 2022-01-24 13:05             ` Ray Kinsella
  0 siblings, 0 replies; 52+ messages in thread
From: Ray Kinsella @ 2022-01-24 13:05 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Honnappa Nagarahalli, Dharmik Thakkar, Olivier Matz,
	Andrew Rybchenko, dev, Ruifeng Wang, Beilei Xing, nd


Morten Brørup <mb@smartsharesystems.com> writes:

> +Ray Kinsella, ABI Policy maintainer
>
>> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
>> Sent: Friday, 21 January 2022 07.01
>> 
>> >
>> > +CC Beilei as i40e maintainer
>> >
>> > > From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
>> > > Sent: Thursday, 13 January 2022 06.37
>> > >
>> > > Current mempool per core cache implementation stores pointers to
>> mbufs
>> > > On 64b architectures, each pointer consumes 8B This patch replaces
>> it
>> > > with index-based implementation, where in each buffer is addressed
>> by
>> > > (pool base address + index) It reduces the amount of memory/cache
>> > > required for per core cache
>> > >
>> > > L3Fwd performance testing reveals minor improvements in the cache
>> > > performance (L1 and L2 misses reduced by 0.60%) with no change in
>> > > throughput
>> > >
>> > > Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
>> > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> > > ---
>> > >  lib/mempool/rte_mempool.h             | 150
>> +++++++++++++++++++++++++-
>> > >  lib/mempool/rte_mempool_ops_default.c |   7 ++
>> > >  2 files changed, 156 insertions(+), 1 deletion(-)
>> > >
>> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>> > > index 1e7a3c15273c..f2403fbc97a7 100644
>> > > --- a/lib/mempool/rte_mempool.h
>> > > +++ b/lib/mempool/rte_mempool.h
>> > > @@ -50,6 +50,10 @@
>> > >  #include <rte_memcpy.h>
>> > >  #include <rte_common.h>
>> > >
>> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> > > +#include <rte_vect.h>
>> > > +#endif
>> > > +
>> > >  #include "rte_mempool_trace_fp.h"
>> > >
>> > >  #ifdef __cplusplus
>> > > @@ -239,6 +243,9 @@ struct rte_mempool {
>> > >  	int32_t ops_index;
>> > >
>> > >  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache
>> */
>> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> > > +	void *pool_base_value; /**< Base value to calculate indices */
>> > > +#endif
>> > >
>> > >  	uint32_t populated_size;         /**< Number of populated
>> > > objects. */
>> > >  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in
>> > > pool */ @@ -1314,7 +1321,22 @@ rte_mempool_cache_flush(struct
>> > > rte_mempool_cache *cache,
>> > >  	if (cache == NULL || cache->len == 0)
>> > >  		return;
>> > >  	rte_mempool_trace_cache_flush(cache, mp);
>> > > +
>> > > +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> > > +	unsigned int i;
>> > > +	unsigned int cache_len = cache->len;
>> > > +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
>> > > +	void *base_value = mp->pool_base_value;
>> > > +	uint32_t *cache_objs = (uint32_t *) cache->objs;
>> >
>> > Hi Dharmik and Honnappa,
>> >
>> > The essence of this patch is based on recasting the type of the objs
>> field in the
>> > rte_mempool_cache structure from an array of pointers to an array of
>> > uint32_t.
>> >
>> > However, this effectively breaks the ABI, because the
>> rte_mempool_cache
>> > structure is public and part of the API.
>> The patch does not change the public structure, the new member is under
>> compile time flag, not sure how it breaks the ABI.
>> 
>> >
>> > Some drivers [1] even bypass the mempool API and access the
>> > rte_mempool_cache structure directly, assuming that the objs array in
>> the
>> > cache is an array of pointers. So you cannot recast the fields in the
>> > rte_mempool_cache structure the way this patch requires.
>> IMO, those drivers are at fault. The mempool cache structure is public
>> only because the APIs are inline. We should still maintain modularity
>> and not use the members of structures belonging to another library
>> directly. A similar effort involving rte_ring was not accepted sometime
>> back [1]
>> 
>> [1]
>> http://inbox.dpdk.org/dev/DBAPR08MB5814907968595EE56F5E20A798390@DBAPR0
>> 8MB5814.eurprd08.prod.outlook.com/
>> 
>> >
>> > Although I do consider bypassing an API's accessor functions
>> "spaghetti
>> > code", this driver's behavior is formally acceptable as long as the
>> > rte_mempool_cache structure is not marked as internal.
>> >
>> > I really liked your idea of using indexes instead of pointers, so I'm
>> very sorry to
>> > shoot it down. :-(
>> >
>> > [1]: E.g. the Intel i40e PMD,
>> >
>> http://code.dpdk.org/dpdk/latest/source/drivers/net/i40e/i40e_rxtx_vec_
>> avx
>> > 512.c#L25
>> It is possible to throw an error when this feature is enabled in this
>> file. Alternatively, this PMD could implement the code for index based
>> mempool.
>> 
>
> I agree with both your points, Honnappa.
>
> The ABI remains intact, and only changes when this feature is enabled at compile time.
>
> In addition to your suggestions, I propose that the patch modifies the objs type in the mempool cache structure itself, instead of type casting it through an access variable. This should throw an error when compiling an application that accesses it as a pointer array instead of a uint32_t array - like the affected Intel PMDs.
>
> The updated objs field in the mempool cache structure should have the same size when compiled as the original objs field, so this feature doesn't change anything else in the ABI, only the type of the mempool cache objects.
>
> Also, the description of the feature should stress that applications accessing the cache objects directly will fail miserably.

Thanks for CC'ing me Morten.

My 2c is that I would be slow to support this patch, as it introduces
code paths that are harder (impossible?) to test regularly. So yes, it
is optional, but in that case are we just adding effectively dead code -
I would ask, would a runtime option not make more sense for this?

Also, we can't automatically assume that what the PMDs are doing breaks
an unwritten rule (breaking abstractions) - I would guess they are
doing it for solid performance reasons. If so, that would further support
my point about making the mempool runtime configurable and query-able
(is this mempool a bucket of indexes or pointers, etc.), and enabling the
PMDs to ask rather than assume.
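
A minimal sketch of such a query (purely illustrative: MEMPOOL_F_INDEX_CACHE is a hypothetical flag and the helper is not an existing DPDK API):

#include <stdbool.h>
#include <rte_mempool.h>

/* Hypothetical flag value, used only for illustration; not part of DPDK. */
#define MEMPOOL_F_INDEX_CACHE 0x8000

/* Lets a PMD ask how the per-lcore cache stores objects instead of
 * assuming it is an array of pointers. */
static inline bool
mempool_cache_is_index_based(const struct rte_mempool *mp)
{
	return (mp->flags & MEMPOOL_F_INDEX_CACHE) != 0;
}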

Like Morten, I like the idea, saving memory and reducing cache misses
with indexes, this is all good IMHO.

-- 
Regards, Ray K

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 1/1] mempool: implement index-based per core cache
  2022-01-21 11:25             ` Ananyev, Konstantin
  2022-01-21 11:31               ` Ananyev, Konstantin
@ 2022-03-24 19:51               ` Dharmik Thakkar
  1 sibling, 0 replies; 52+ messages in thread
From: Dharmik Thakkar @ 2022-03-24 19:51 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Andrew Rybchenko, dev, nd, Honnappa Nagarahalli,
	Ruifeng Wang

Hi,

Thank you for the comments!

Based on the suggestions, I tested the patch for single-core L3Fwd performance with an increased number of routes/flows (maximum 8K) to increase the cache footprint.
However, I don't see much improvement with the patch.

> On Jan 21, 2022, at 5:25 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> 
> 
> Hi Dharmik,
>>> 
>>>>> 
>>>>>> Current mempool per core cache implementation stores pointers to mbufs
>>>>>> On 64b architectures, each pointer consumes 8B
>>>>>> This patch replaces it with index-based implementation,
>>>>>> where in each buffer is addressed by (pool base address + index)
>>>>>> It reduces the amount of memory/cache required for per core cache
>>>>>> 
>>>>>> L3Fwd performance testing reveals minor improvements in the cache
>>>>>> performance (L1 and L2 misses reduced by 0.60%)
>>>>>> with no change in throughput
>>>>> 
>>>>> I feel really sceptical about that patch and the whole idea in general:
>>>>> - From what I read above there is no real performance improvement observed.
>>>>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>>>>> see below for more details).
>>>> 
>>>> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
>>>> Similar optimizations can be implemented for x86 platforms which should close the performance gap
>>>> and in my understanding should give better performance for a bulk size of 32.
>>> 
>>> Might be, but I still don't see the reason for such effort.
>>> As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
>>> on ARM64 even with vectorized version of the code.
>>> 
>> 
>> IMO, even without performance improvement, it is advantageous because the same performance is being achieved
>> with less memory and cache utilization using the patch.
>> 
>>>>> - Space utilization difference looks neglectable too.
>>>> 
>>>> Sorry, I did not understand this point.
>>> 
>>> As I understand one of the expectations from that patch was:
>>> reduce memory/cache required, which should improve cache utilization
>>> (less misses, etc.).
>>> Though I think such improvements would be neglectable and wouldn't
>>> cause any real performance gain.
>> 
>> The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
>> Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
> 
> It saves you 4 just bytes per mbuf.
> Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
> Let's do a simplistic estimation of  number of affected cache-lines l for l2fwd. 
> For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
> 
>                                                                       number of cache-lines accessed 
>                                                                   cache with pointers / cache with indexes 
> mempool_get:                                            (32*8)/64=4          /  (32*4)/64=2
> RX (read HW desc):                                    (32*16)/64=8       /   (32*16)/64=8
> RX (write mbuf fields, 1st cache line):    (32*64)/64=3       /   (32*64)/64=32
> update mac addrs:                                     (32*64)/64=32     /   (32*64)/64=32   
> TX (write HW desc):                                   (32*16)/64=8       /   (32*16)/64=8
> free mbufs (read 2nd mbuf cache line): (32*64)/64=32    /   (32*64)/64=32   
> mempool_put:                                            (32*8)/64=4        /    (32*4)/64=2
> total:                                                             120                             116
> 
> So, if my calculations are correct, max estimated gain for cache utilization would be:
> (120-116)*100/120=3.33% 
> Note that numbers are for over-simplistic usage scenario.
> In more realistic ones, when we have to touch more cache-lines per packet,
> that difference would be even less noticeable.
> So I really doubt we will see some noticeable improvements in terms of cache utilization
> with that patch.
> 
>>> 
>>>>> - The change introduces a new build time config option with a major limitation:
>>>>> All memzones in a pool have to be within the same 4GB boundary.
>>>>> To address it properly, extra changes will be required in init(/populate) part of the code.
>>>> 
>>>> I agree to the above mentioned challenges and I am currently working on resolving these issues.
>>> 
>>> I still think that to justify such changes some really noticeable performance
>>> improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
>>> Otherwise it just not worth the hassle.
>>> 
>> 
>> Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
>> That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
> 
> As I said above, I don’t think we'll see any real advantage here.
> But feel free to pick-up different app and prove me wrong.
> After all we have plenty of sample apps that do provide enough
> pressure on the cache: l3fwd-acl, ipsec-secgw.
> Or you can even apply these patches from Sean:
> https://patches.dpdk.org/project/dpdk/list/?series=20999
> to run l3fwd with configurable routes.
> That should help you to make it cache-bound.
> 

Thank you, Konstantin! This patch was helpful.

>> I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
> 
> And for other we are seeing double digit degradation.
> So far the whole idea doesn't look promising at all, at least to me.
> Konstantin
> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 0/1] mempool: implement index-based per core cache
  2022-01-13  5:31               ` Dharmik Thakkar
@ 2023-07-06 17:43                 ` Stephen Hemminger
  2023-07-31 12:23                   ` Thomas Monjalon
  0 siblings, 1 reply; 52+ messages in thread
From: Stephen Hemminger @ 2023-07-06 17:43 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: Jerin Jacob, Morten Brørup, Bruce Richardson,
	Honnappa Nagarahalli, dpdk-dev, nd, Ruifeng Wang

On Thu, 13 Jan 2022 05:31:18 +0000
Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:

> Hi,
> 
> Thank you for your valuable review comments and suggestions!
> 
> I will be sending out a v2 in which I have increased the size of the mempool to 32GB by using division by sizeof(uintptr_t).
> However, I am seeing ~5% performance degradation with mempool_perf_autotest (for bulk size of 32) with this change
> when compared to the base performance.
> Earlier, without this change, I was seeing an improvement of ~13% compared to base performance. So, this is a significant degradation.
> I would appreciate your review comments on v2.
> 
> Thank you!
> 
> > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > 
> > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com> wrote:  
> >>   
> >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> >>> Sent: Friday, 7 January 2022 14.51
> >>> 
> >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:  
> >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> >>>>> Sent: Friday, 7 January 2022 12.16
> >>>>> 
> >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:  
> >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:  
> >>>>> Friday, 24  
> >>>>>>> December 2021 23.59
> >>>>>>> 
> >>>>>>> Current mempool per core cache implementation stores pointers  
> >>> to  
> >>>>> mbufs  
> >>>>>>> On 64b architectures, each pointer consumes 8B This patch  
> >>> replaces  
> >>>>> it  
> >>>>>>> with index-based implementation, where in each buffer is  
> >>> addressed  
> >>>>> by  
> >>>>>>> (pool base address + index) It reduces the amount of  
> >>> memory/cache  
> >>>>>>> required for per core cache
> >>>>>>> 
> >>>>>>> L3Fwd performance testing reveals minor improvements in the  
> >>> cache  
> >>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no change  
> >>> in  
> >>>>>>> throughput
> >>>>>>> 
> >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows  
> >>>>> significant  
> >>>>>>> improvement with majority of the test cases
> >>>>>>>   
> >>>>>> 
> >>>>>> I still think this is very interesting. And your performance  
> >>> numbers  
> >>>>> are  
> >>>>>> looking good.
> >>>>>> 
> >>>>>> However, it limits the size of a mempool to 4 GB. As previously
> >>>>>> discussed, the max mempool size can be increased by multiplying  
> >>> the  
> >>>>> index  
> >>>>>> with a constant.
> >>>>>> 
> >>>>>> I would suggest using sizeof(uintptr_t) as the constant  
> >>> multiplier,  
> >>>>> so  
> >>>>>> the mempool can hold objects of any size divisible by  
> >>>>> sizeof(uintptr_t).  
> >>>>>> And it would be silly to use a mempool to hold objects smaller  
> >>> than  
> >>>>>> sizeof(uintptr_t).
> >>>>>> 
> >>>>>> How does the performance look if you multiply the index by
> >>>>>> sizeof(uintptr_t)?
> >>>>>>   
> >>>>> 
> >>>>> Each mempool entry is cache aligned, so we can use that if we want  
> >>> a  
> >>>>> bigger
> >>>>> multiplier.  
> >>>> 
> >>>> Thanks for chiming in, Bruce.
> >>>> 
> >>>> Please also read this discussion about the multiplier:
> >>>> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
> >>>>   
> >>> 
> >>> I actually wondered after I had sent the email whether we had indeed an
> >>> option to disable the cache alignment or not! Thanks for pointing out
> >>> that
> >>> we do. This brings a couple additional thoughts:
> >>> 
> >>> * Using indexes for the cache should probably be a runtime flag rather
> >>> than
> >>>  a build-time one.
> >>> * It would seem reasonable to me to disallow use of the indexed-cache
> >>> flag
> >>>  and the non-cache aligned flag simultaneously.
> >>> * On the offchance that that restriction is unacceptable, then we can
> >>>  make things a little more complicated by doing a runtime computation
> >>> of
> >>>  the "index-shiftwidth" to use.
> >>> 
> >>> Overall, I think defaulting to cacheline shiftwidth and disallowing
> >>> index-based addressing when using unaligned buffers is simplest and
> >>> easiest
> >>> unless we can come up with a valid usecase for needing more than that.
> >>> 
> >>> /Bruce  
> >> 
> >> This feature is a performance optimization.
> >> 
> >> With that in mind, it should not introduce function pointers or similar run-time checks or in the fast path, to determine what kind of cache to use per mempool. And if an index multiplier is implemented, it should be a compile time constant, probably something between sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> >> 
> >> The patch comes with a tradeoff between better performance and limited mempool size, and possibly some limitations regarding very small objects that are not cache line aligned to avoid wasting memory (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> >> 
> >> With no multiplier, the only tradeoff is that the mempool size is limited to 4 GB.
> >> 
> >> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the mempool size is limited to 32 GB. (And a waste of memory for objects smaller than 8 byte; but I don't think anyone would use a mempool to hold objects smaller than 8 byte.)
> >> 
> >> If the multiplier is larger (i.e. 64 bytes cache line size), the mempool size is instead limited to 256 GB, but RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> >> 
> >> Note: 32 bit platforms have no benefit from this patch: The pointer already only uses 4 bytes, so replacing the pointer with a 4 byte index makes no difference.
> >> 
> >> 
> >> Since this feature is a performance optimization only, and doesn't provide any new features, I don't mind it being a compile time option.
> >> 
> >> If this feature is a compile time option, and the mempool library is compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the public header file, so compilation of applications using the flag will fail. And rte_mempool_create() could RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags parameter, or emit a warning about the flag being ignored. Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger than the library supports, possibly emitting a message that the mempool library should be built without this feature to support the larger mempool.
> >> 
> >> Here is another thought: If only exotic applications use mempools larger than 32 GB, this would be a generally acceptable limit, and DPDK should use index-based cache as default, making the opposite (i.e. pointer-based cache) a compile time option instead. A similar decision was recently made for limiting the RTE_MAX_LCORE default.
> >> 
> >> 
> >> Although DPDK is moving away from compile time options in order to better support Linux distros, there should be a general exception for performance and memory optimizations. Otherwise, network appliance vendors will inherit the increasing amount of DPDK bloat, and we (network appliance vendors) will eventually be forced to fork DPDK to get rid of the bloat and achieve the goals originally intended by DPDK.  
> > 
> > Agree with Morten's view on this.
> >   
> >> If anyone disagrees with the principle about a general exception for performance and memory optimizations, I would like to pass on the decision to the Techboard!
> >>   

NAK
Having compile-time options like this means one side or the other is not tested
by the CI infrastructure. There never was sufficient justification, and there were lots of objections.
Dropping the patch.
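
For the record, the sizeof(uintptr_t) multiplier scheme debated in the quoted thread would look roughly like this (a sketch only; 'base' stands in for the pool base address field, and objects are assumed to be at least pointer-aligned). Scaling a 32-bit index by 8B extends the addressable pool span from 4GB to 32GB:

#include <stdint.h>

#define IDX_SCALE sizeof(uintptr_t) /* 8B multiplier on 64-bit targets */

static inline uint32_t
obj_to_index(const void *base, const void *obj)
{
	return (uint32_t)(((uintptr_t)obj - (uintptr_t)base) / IDX_SCALE);
}

static inline void *
index_to_obj(const void *base, uint32_t idx)
{
	return (void *)((uintptr_t)base + (uintptr_t)idx * IDX_SCALE);
}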


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH 0/1] mempool: implement index-based per core cache
  2023-07-06 17:43                 ` Stephen Hemminger
@ 2023-07-31 12:23                   ` Thomas Monjalon
  2023-07-31 12:33                     ` Morten Brørup
  0 siblings, 1 reply; 52+ messages in thread
From: Thomas Monjalon @ 2023-07-31 12:23 UTC (permalink / raw)
  To: Dharmik Thakkar
  Cc: dev, Jerin Jacob, Morten Brørup, Bruce Richardson,
	Honnappa Nagarahalli, nd, Ruifeng Wang, Stephen Hemminger,
	olivier.matz, andrew.rybchenko

The v2 was not sent, and Stephen dropped the patch from patchwork.

Do we abandon this feature?
Should I remove it from the roadmap?


06/07/2023 19:43, Stephen Hemminger:
> On Thu, 13 Jan 2022 05:31:18 +0000
> Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:
> 
> > Hi,
> > 
> > Thank you for your valuable review comments and suggestions!
> > 
> > I will be sending out a v2 in which I have increased the size of the mempool to 32GB by using division by sizeof(uintptr_t).
> > However, I am seeing ~5% performance degradation with mempool_perf_autotest (for bulk size of 32) with this change
> > when compared to the base performance.
> > Earlier, without this change, I was seeing an improvement of ~13% compared to base performance. So, this is a significant degradation.
> > I would appreciate your review comments on v2.
> > 
> > Thank you!
> > 
> > > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > 
> > > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com> wrote:  
> > >>   
> > >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > >>> Sent: Friday, 7 January 2022 14.51
> > >>> 
> > >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:  
> > >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > >>>>> Sent: Friday, 7 January 2022 12.16
> > >>>>> 
> > >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:  
> > >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:  
> > >>>>> Friday, 24  
> > >>>>>>> December 2021 23.59
> > >>>>>>> 
> > >>>>>>> Current mempool per core cache implementation stores pointers  
> > >>> to  
> > >>>>> mbufs  
> > >>>>>>> On 64b architectures, each pointer consumes 8B This patch  
> > >>> replaces  
> > >>>>> it  
> > >>>>>>> with index-based implementation, where in each buffer is  
> > >>> addressed  
> > >>>>> by  
> > >>>>>>> (pool base address + index) It reduces the amount of  
> > >>> memory/cache  
> > >>>>>>> required for per core cache
> > >>>>>>> 
> > >>>>>>> L3Fwd performance testing reveals minor improvements in the  
> > >>> cache  
> > >>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no change  
> > >>> in  
> > >>>>>>> throughput
> > >>>>>>> 
> > >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows  
> > >>>>> significant  
> > >>>>>>> improvement with majority of the test cases
> > >>>>>>>   
> > >>>>>> 
> > >>>>>> I still think this is very interesting. And your performance  
> > >>> numbers  
> > >>>>> are  
> > >>>>>> looking good.
> > >>>>>> 
> > >>>>>> However, it limits the size of a mempool to 4 GB. As previously
> > >>>>>> discussed, the max mempool size can be increased by multiplying  
> > >>> the  
> > >>>>> index  
> > >>>>>> with a constant.
> > >>>>>> 
> > >>>>>> I would suggest using sizeof(uintptr_t) as the constant  
> > >>> multiplier,  
> > >>>>> so  
> > >>>>>> the mempool can hold objects of any size divisible by  
> > >>>>> sizeof(uintptr_t).  
> > >>>>>> And it would be silly to use a mempool to hold objects smaller  
> > >>> than  
> > >>>>>> sizeof(uintptr_t).
> > >>>>>> 
> > >>>>>> How does the performance look if you multiply the index by
> > >>>>>> sizeof(uintptr_t)?
> > >>>>>>   
> > >>>>> 
> > >>>>> Each mempool entry is cache aligned, so we can use that if we want  
> > >>> a  
> > >>>>> bigger
> > >>>>> multiplier.  
> > >>>> 
> > >>>> Thanks for chiming in, Bruce.
> > >>>> 
> > >>>> Please also read this discussion about the multiplier:
> > >>>> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
> > >>>>   
> > >>> 
> > >>> I actually wondered after I had sent the email whether we had indeed an
> > >>> option to disable the cache alignment or not! Thanks for pointing out
> > >>> that
> > >>> we do. This brings a couple additional thoughts:
> > >>> 
> > >>> * Using indexes for the cache should probably be a runtime flag rather
> > >>> than
> > >>>  a build-time one.
> > >>> * It would seem reasonable to me to disallow use of the indexed-cache
> > >>> flag
> > >>>  and the non-cache aligned flag simultaneously.
> > >>> * On the offchance that that restriction is unacceptable, then we can
> > >>>  make things a little more complicated by doing a runtime computation
> > >>> of
> > >>>  the "index-shiftwidth" to use.
> > >>> 
> > >>> Overall, I think defaulting to cacheline shiftwidth and disallowing
> > >>> index-based addressing when using unaligned buffers is simplest and
> > >>> easiest
> > >>> unless we can come up with a valid usecase for needing more than that.
> > >>> 
> > >>> /Bruce  
> > >> 
> > >> This feature is a performance optimization.
> > >> 
> > >> With that in mind, it should not introduce function pointers or similar run-time checks or in the fast path, to determine what kind of cache to use per mempool. And if an index multiplier is implemented, it should be a compile time constant, probably something between sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> > >> 
> > >> The patch comes with a tradeoff between better performance and limited mempool size, and possibly some limitations regarding very small objects that are not cache line aligned to avoid wasting memory (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> > >> 
> > >> With no multiplier, the only tradeoff is that the mempool size is limited to 4 GB.
> > >> 
> > >> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the mempool size is limited to 32 GB. (And a waste of memory for objects smaller than 8 byte; but I don't think anyone would use a mempool to hold objects smaller than 8 byte.)
> > >> 
> > >> If the multiplier is larger (i.e. 64 bytes cache line size), the mempool size is instead limited to 256 GB, but RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> > >> 
> > >> Note: 32 bit platforms have no benefit from this patch: The pointer already only uses 4 bytes, so replacing the pointer with a 4 byte index makes no difference.
> > >> 
> > >> 
> > >> Since this feature is a performance optimization only, and doesn't provide any new features, I don't mind it being a compile time option.
> > >> 
> > >> If this feature is a compile time option, and the mempool library is compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the public header file, so compilation of applications using the flag will fail. And rte_mempool_create() could RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags parameter, or emit a warning about the flag being ignored. Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger than the library supports, possibly emitting a message that the mempool library should be built without this feature to support the larger mempool.
> > >> 
> > >> Here is another thought: If only exotic applications use mempools larger than 32 GB, this would be a generally acceptable limit, and DPDK should use index-based cache as default, making the opposite (i.e. pointer-based cache) a compile time option instead. A similar decision was recently made for limiting the RTE_MAX_LCORE default.
> > >> 
> > >> 
> > >> Although DPDK is moving away from compile time options in order to better support Linux distros, there should be a general exception for performance and memory optimizations. Otherwise, network appliance vendors will inherit the increasing amount of DPDK bloat, and we (network appliance vendors) will eventually be forced to fork DPDK to get rid of the bloat and achieve the goals originally intended by DPDK.  
> > > 
> > > Agree with Morten's view on this.
> > >   
> > >> If anyone disagrees with the principle about a general exception for performance and memory optimizations, I would like to pass on the decision to the Techboard!
> > >>   
> 
> NAK
> Having compile time stuff like this means one side or the other is not tested
> by CI infrastructure.  There never was sufficient justification, and lots of objections.
> Dropping the patch.
> 
> 






^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 0/1] mempool: implement index-based per core cache
  2023-07-31 12:23                   ` Thomas Monjalon
@ 2023-07-31 12:33                     ` Morten Brørup
  2023-07-31 14:57                       ` Dharmik Jayesh Thakkar
  0 siblings, 1 reply; 52+ messages in thread
From: Morten Brørup @ 2023-07-31 12:33 UTC (permalink / raw)
  To: Thomas Monjalon, Dharmik Thakkar
  Cc: dev, Jerin Jacob, Bruce Richardson, Honnappa Nagarahalli, nd,
	Ruifeng Wang, Stephen Hemminger, olivier.matz, andrew.rybchenko

> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Monday, 31 July 2023 14.24
> 
> The v2 was not sent, and Stephen dropped the patch from patchwork.
> 
> Do we abandon this feature?

+1, because I think that the zero-copy mempool cache access functions make this patch irrelevant.

> Should I remove it from the roadmap?

+1
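
For context, a rough usage sketch of those zero-copy cache access functions (assuming the experimental rte_mempool_cache_zc_get_bulk()/rte_mempool_cache_zc_put_bulk() API; check the installed DPDK headers for the exact signatures):

#include <rte_mempool.h>

/* A PMD writes object pointers straight into the per-lcore cache instead
 * of copying them through rte_mempool_put_bulk(). Sketch only. */
static int
stash_mbufs(struct rte_mempool *mp, struct rte_mempool_cache *cache,
	    void * const *mbufs, unsigned int n)
{
	void **slot = rte_mempool_cache_zc_put_bulk(cache, mp, n);
	unsigned int i;

	if (slot == NULL)
		return -1; /* fall back to rte_mempool_put_bulk() */
	for (i = 0; i < n; i++)
		slot[i] = mbufs[i];
	return 0;
}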

> 
> 
> 06/07/2023 19:43, Stephen Hemminger:
> > On Thu, 13 Jan 2022 05:31:18 +0000
> > Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:
> >
> > > Hi,
> > >
> > > Thank you for your valuable review comments and suggestions!
> > >
> > > I will be sending out a v2 in which I have increased the size of the
> mempool to 32GB by using division by sizeof(uintptr_t).
> > > However, I am seeing ~5% performance degradation with
> mempool_perf_autotest (for bulk size of 32) with this change
> > > when compared to the base performance.
> > > Earlier, without this change, I was seeing an improvement of ~13% compared
> to base performance. So, this is a significant degradation.
> > > I would appreciate your review comments on v2.
> > >
> > > Thank you!
> > >
> > > > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > >
> > > > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com>
> wrote:
> > > >>
> > > >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > >>> Sent: Friday, 7 January 2022 14.51
> > > >>>
> > > >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > > >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > >>>>> Sent: Friday, 7 January 2022 12.16
> > > >>>>>
> > > >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> > > >>>>> Friday, 24
> > > >>>>>>> December 2021 23.59
> > > >>>>>>>
> > > >>>>>>> Current mempool per core cache implementation stores pointers
> > > >>> to
> > > >>>>> mbufs
> > > >>>>>>> On 64b architectures, each pointer consumes 8B This patch
> > > >>> replaces
> > > >>>>> it
> > > >>>>>>> with index-based implementation, where in each buffer is
> > > >>> addressed
> > > >>>>> by
> > > >>>>>>> (pool base address + index) It reduces the amount of
> > > >>> memory/cache
> > > >>>>>>> required for per core cache
> > > >>>>>>>
> > > >>>>>>> L3Fwd performance testing reveals minor improvements in the
> > > >>> cache
> > > >>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no change
> > > >>> in
> > > >>>>>>> throughput
> > > >>>>>>>
> > > >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows
> > > >>>>> significant
> > > >>>>>>> improvement with majority of the test cases
> > > >>>>>>>
> > > >>>>>>
> > > >>>>>> I still think this is very interesting. And your performance
> > > >>> numbers
> > > >>>>> are
> > > >>>>>> looking good.
> > > >>>>>>
> > > >>>>>> However, it limits the size of a mempool to 4 GB. As previously
> > > >>>>>> discussed, the max mempool size can be increased by multiplying
> > > >>> the
> > > >>>>> index
> > > >>>>>> with a constant.
> > > >>>>>>
> > > >>>>>> I would suggest using sizeof(uintptr_t) as the constant
> > > >>> multiplier,
> > > >>>>> so
> > > >>>>>> the mempool can hold objects of any size divisible by
> > > >>>>> sizeof(uintptr_t).
> > > >>>>>> And it would be silly to use a mempool to hold objects smaller
> > > >>> than
> > > >>>>>> sizeof(uintptr_t).
> > > >>>>>>
> > > >>>>>> How does the performance look if you multiply the index by
> > > >>>>>> sizeof(uintptr_t)?
> > > >>>>>>
> > > >>>>>
> > > >>>>> Each mempool entry is cache aligned, so we can use that if we want
> > > >>> a
> > > >>>>> bigger
> > > >>>>> multiplier.
> > > >>>>
> > > >>>> Thanks for chiming in, Bruce.
> > > >>>>
> > > >>>> Please also read this discussion about the multiplier:
> > > >>>>
> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@
> mail.gmail.com/
> > > >>>>
> > > >>>
> > > >>> I actually wondered after I had sent the email whether we had indeed
> an
> > > >>> option to disable the cache alignment or not! Thanks for pointing out
> > > >>> that
> > > >>> we do. This brings a couple additional thoughts:
> > > >>>
> > > >>> * Using indexes for the cache should probably be a runtime flag rather
> > > >>> than
> > > >>>  a build-time one.
> > > >>> * It would seem reasonable to me to disallow use of the indexed-cache
> > > >>> flag
> > > >>>  and the non-cache aligned flag simultaneously.
> > > >>> * On the offchance that that restriction is unacceptable, then we can
> > > >>>  make things a little more complicated by doing a runtime computation
> > > >>> of
> > > >>>  the "index-shiftwidth" to use.
> > > >>>
> > > >>> Overall, I think defaulting to cacheline shiftwidth and disallowing
> > > >>> index-based addressing when using unaligned buffers is simplest and
> > > >>> easiest
> > > >>> unless we can come up with a valid usecase for needing more than that.
> > > >>>
> > > >>> /Bruce
> > > >>
> > > >> This feature is a performance optimization.
> > > >>
> > > >> With that in mind, it should not introduce function pointers or similar
> run-time checks or in the fast path, to determine what kind of cache to use
> per mempool. And if an index multiplier is implemented, it should be a compile
> time constant, probably something between sizeof(uintptr_t) or
> RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> > > >>
> > > >> The patch comes with a tradeoff between better performance and limited
> mempool size, and possibly some limitations regarding very small objects that
> are not cache line aligned to avoid wasting memory
> (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> > > >>
> > > >> With no multiplier, the only tradeoff is that the mempool size is
> limited to 4 GB.
> > > >>
> > > >> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the
> mempool size is limited to 32 GB. (And a waste of memory for objects smaller
> than 8 byte; but I don't think anyone would use a mempool to hold objects
> smaller than 8 byte.)
> > > >>
> > > >> If the multiplier is larger (i.e. 64 bytes cache line size), the
> mempool size is instead limited to 256 GB, but
> RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> > > >>
> > > >> Note: 32 bit platforms have no benefit from this patch: The pointer
> already only uses 4 bytes, so replacing the pointer with a 4 byte index makes
> no difference.
> > > >>
> > > >>
> > > >> Since this feature is a performance optimization only, and doesn't
> provide any new features, I don't mind it being a compile time option.
> > > >>
> > > >> If this feature is a compile time option, and the mempool library is
> compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ
> could be made undefined in the public header file, so compilation of
> applications using the flag will fail. And rte_mempool_create() could
> RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags
> parameter, or emit a warning about the flag being ignored. Obviously,
> rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger
> than the library supports, possibly emitting a message that the mempool
> library should be built without this feature to support the larger mempool.
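
A minimal sketch of the create-time guard described above; the macro names,
the 64-byte multiplier and the exact placement are assumptions, not existing
DPDK definitions:

	/*
	 * Illustrative guard only: IDXCACHE_* and the idea of checking at
	 * create time are assumptions, not existing DPDK code.
	 */
	#include <errno.h>
	#include <stdint.h>

	#include <rte_debug.h>
	#include <rte_errno.h>

	#define IDXCACHE_MULTIPLIER     64ULL   /* assumed cache-line multiplier */
	#define IDXCACHE_MAX_POOL_BYTES ((1ULL << 32) * IDXCACHE_MULTIPLIER) /* 256 GB */

	static int
	idxcache_check_pool_size(uint64_t pool_bytes)
	{
		/* Debug builds: assert the pool fits the 32-bit index space. */
		RTE_ASSERT(pool_bytes <= IDXCACHE_MAX_POOL_BYTES);

		/* Release builds: fail gracefully with a clear errno instead. */
		if (pool_bytes > IDXCACHE_MAX_POOL_BYTES) {
			rte_errno = EINVAL;
			return -1;
		}
		return 0;
	}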
> > > >>
> > > >> Here is another thought: If only exotic applications use mempools
> larger than 32 GB, this would be a generally acceptable limit, and DPDK should
> use index-based cache as default, making the opposite (i.e. pointer-based
> cache) a compile time option instead. A similar decision was recently made for
> limiting the RTE_MAX_LCORE default.
> > > >>
> > > >>
> > > >> Although DPDK is moving away from compile time options in order to
> better support Linux distros, there should be a general exception for
> performance and memory optimizations. Otherwise, network appliance vendors
> will inherit the increasing amount of DPDK bloat, and we (network appliance
> vendors) will eventually be forced to fork DPDK to get rid of the bloat and
> achieve the goals originally intended by DPDK.
> > > >
> > > > Agree with Morten's view on this.
> > > >
> > > >> If anyone disagrees with the principle about a general exception for
> performance and memory optimizations, I would like to pass on the decision to
> the Techboard!
> > > >>
> >
> > NAK
> > Having compile time stuff like this means one side or the other is not
> > tested by CI infrastructure. There never was sufficient justification,
> > and lots of objections.
> > Dropping the patch.
> >
> >
> 
> 
> 
> 


^ permalink raw reply	[flat|nested] 52+ messages in thread

* RE: [PATCH 0/1] mempool: implement index-based per core cache
  2023-07-31 12:33                     ` Morten Brørup
@ 2023-07-31 14:57                       ` Dharmik Jayesh Thakkar
  0 siblings, 0 replies; 52+ messages in thread
From: Dharmik Jayesh Thakkar @ 2023-07-31 14:57 UTC (permalink / raw)
  To: Morten Brørup, thomas
  Cc: dev, Jerin Jacob, Bruce Richardson, Honnappa Nagarahalli, nd,
	Ruifeng Wang, Stephen Hemminger, olivier.matz, andrew.rybchenko,
	nd



> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: Monday, July 31, 2023 7:33 AM
> To: thomas@monjalon.net; Dharmik Jayesh Thakkar
> <DharmikJayesh.Thakkar@arm.com>
> Cc: dev@dpdk.org; Jerin Jacob <jerinjacobk@gmail.com>; Bruce Richardson
> <bruce.richardson@intel.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>; Stephen Hemminger
> <stephen@networkplumber.org>; olivier.matz@6wind.com;
> andrew.rybchenko@oktetlabs.ru
> Subject: RE: [PATCH 0/1] mempool: implement index-based per core cache
>
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Monday, 31 July 2023 14.24
> >
> > The v2 was not sent, and Stephen dropped the patch from patchwork.
> >
> > Do we abandon this feature?
>
> +1, because I think that the zero-copy mempool cache access functions make
> this patch irrelevant.
>
> > Should I remove it from the roadmap?
>
> +1

V2 was sent (https://patches.dpdk.org/project/dpdk/patch/20220113053630.886638-1-dharmik.thakkar@arm.com/).
However, it is not relevant anymore and can be dropped. Thank you!
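
The zero-copy cache access mentioned above refers to the experimental
rte_mempool_cache_zc_get_bulk()/rte_mempool_cache_zc_put_bulk() functions
added around DPDK 23.03. A hedged sketch of how a driver might use them,
assuming that API is available (and experimental APIs enabled) in the
targeted release:

	#include <rte_lcore.h>
	#include <rte_mbuf.h>
	#include <rte_mempool.h>

	static void
	zc_free_mbufs(struct rte_mempool *mp, struct rte_mbuf **mbufs, unsigned int n)
	{
		struct rte_mempool_cache *cache =
			rte_mempool_default_cache(mp, rte_lcore_id());
		void **slots;
		unsigned int i;

		if (cache != NULL) {
			/* Reserve n slots directly in the per-lcore cache... */
			slots = rte_mempool_cache_zc_put_bulk(cache, mp, n);
			if (slots != NULL) {
				/* ...and write the pointers straight into it,
				 * with no intermediate bounce buffer. */
				for (i = 0; i < n; i++)
					slots[i] = mbufs[i];
				return;
			}
		}
		/* Fallback: the classic bulk put. */
		rte_mempool_put_bulk(mp, (void **)mbufs, n);
	}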

>
> >
> >
> > 06/07/2023 19:43, Stephen Hemminger:
> > > On Thu, 13 Jan 2022 05:31:18 +0000
> > > Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:
> > >
> > > > Hi,
> > > >
> > > > Thank you for your valuable review comments and suggestions!
> > > >
> > > > I will be sending out a v2 in which I have increased the size of the
> > > > mempool to 32GB by using division by sizeof(uintptr_t).
> > > > However, I am seeing ~5% performance degradation with
> > > > mempool_perf_autotest (for bulk size of 32) with this change when
> > > > compared to the base performance.
> > > > Earlier, without this change, I was seeing an improvement of ~13%
> > > > compared to base performance. So, this is a significant degradation.
> > > > I would appreciate your review comments on v2.
> > > >
> > > > Thank you!
> > > >
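
The ~5% gap is plausibly the extra scaling on every encode and decode once
the raw 32-bit byte offset is replaced by offset / sizeof(uintptr_t). A hedged
illustration of the two encodings being compared (not the actual v1/v2 code):

	#include <stdint.h>

	/* v1-style: raw 32-bit byte offset, pool limited to 4 GB. */
	static inline uint32_t
	enc_offset(void *base, void *obj)
	{
		return (uint32_t)((uintptr_t)obj - (uintptr_t)base);
	}

	/* v2-style: offset divided by sizeof(uintptr_t), pool limited to
	 * 32 GB, at the cost of a shift on every encode and decode. */
	static inline uint32_t
	enc_scaled(void *base, void *obj)
	{
		return (uint32_t)(((uintptr_t)obj - (uintptr_t)base) / sizeof(uintptr_t));
	}

	static inline void *
	dec_scaled(void *base, uint32_t idx)
	{
		return (void *)((uintptr_t)base + ((uintptr_t)idx * sizeof(uintptr_t)));
	}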
> > > > > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com>
> wrote:
> > > > >
> > > > > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup
> > > > > <mb@smartsharesystems.com>
> > wrote:
> > > > >>
> > > > >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > >>> Sent: Friday, 7 January 2022 14.51
> > > > >>>
> > > > >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > > > >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > >>>>> Sent: Friday, 7 January 2022 12.16
> > > > >>>>>
> > > > >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > > >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
> > > > >>>>>>> Sent: Friday, 24 December 2021 23.59
> > > > >>>>>>>
> > > > >>>>>>> Current mempool per core cache implementation stores
> > > > >>>>>>> pointers to mbufs. On 64b architectures, each pointer
> > > > >>>>>>> consumes 8B. This patch replaces it with an index-based
> > > > >>>>>>> implementation, wherein each buffer is addressed by
> > > > >>>>>>> (pool base address + index). It reduces the amount of
> > > > >>>>>>> memory/cache required for the per core cache.
> > > > >>>>>>>
> > > > >>>>>>> L3Fwd performance testing reveals minor improvements in the
> > > > >>>>>>> cache performance (L1 and L2 misses reduced by 0.60%) with
> > > > >>>>>>> no change in throughput.
> > > > >>>>>>>
> > > > >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows
> > > > >>>>>>> significant improvement with the majority of the test cases.
> > > > >>>>>>>
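
For readers skimming the thread, a rough sketch of the scheme that commit
message describes, assuming a single contiguous object area; the names are
illustrative and do not match the actual patch:

	/*
	 * Minimal sketch of an index-based per-core cache. No bounds or
	 * overflow checks; sketch only.
	 */
	#include <stdint.h>

	struct idx_pcache {
		void    *base;        /* base address shared by all cached objects */
		uint32_t len;         /* current number of cached entries */
		uint32_t objs[512];   /* 4-byte indices instead of 8-byte pointers */
	};

	static inline void
	idx_pcache_put(struct idx_pcache *c, void *obj)
	{
		/* Store the offset from the base instead of the raw pointer. */
		c->objs[c->len++] = (uint32_t)((uintptr_t)obj - (uintptr_t)c->base);
	}

	static inline void *
	idx_pcache_get(struct idx_pcache *c)
	{
		/* Rebuild the pointer from (base + index). */
		return (void *)((uintptr_t)c->base + c->objs[--c->len]);
	}

The point is simply that each cached entry shrinks from 8 bytes to 4 bytes on
64-bit targets, halving the per-core cache footprint.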
> > > > >>>>>>
> > > > >>>>>> I still think this is very interesting. And your
> > > > >>>>>> performance
> > > > >>> numbers
> > > > >>>>> are
> > > > >>>>>> looking good.
> > > > >>>>>>
> > > > >>>>>> However, it limits the size of a mempool to 4 GB. As
> > > > >>>>>> previously discussed, the max mempool size can be increased
> > > > >>>>>> by multiplying
> > > > >>> the
> > > > >>>>> index
> > > > >>>>>> with a constant.
> > > > >>>>>>
> > > > >>>>>> I would suggest using sizeof(uintptr_t) as the constant
> > > > >>> multiplier,
> > > > >>>>> so
> > > > >>>>>> the mempool can hold objects of any size divisible by
> > > > >>>>> sizeof(uintptr_t).
> > > > >>>>>> And it would be silly to use a mempool to hold objects
> > > > >>>>>> smaller
> > > > >>> than
> > > > >>>>>> sizeof(uintptr_t).
> > > > >>>>>>
> > > > >>>>>> How does the performance look if you multiply the index by
> > > > >>>>>> sizeof(uintptr_t)?
> > > > >>>>>>
> > > > >>>>>
> > > > >>>>> Each mempool entry is cache aligned, so we can use that if
> > > > >>>>> we want
> > > > >>> a
> > > > >>>>> bigger
> > > > >>>>> multiplier.
> > > > >>>>
> > > > >>>> Thanks for chiming in, Bruce.
> > > > >>>>
> > > > >>>> Please also read this discussion about the multiplier:
> > > > >>>>
> > > > >>>> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
> > > > >>>>
> > > > >>>
> > > > >>> I actually wondered after I had sent the email whether we had
> > > > >>> indeed
> > an
> > > > >>> option to disable the cache alignment or not! Thanks for
> > > > >>> pointing out that we do. This brings a couple additional
> > > > >>> thoughts:
> > > > >>>
> > > > >>> * Using indexes for the cache should probably be a runtime
> > > > >>> flag rather than  a build-time one.
> > > > >>> * It would seem reasonable to me to disallow use of the
> > > > >>> indexed-cache flag  and the non-cache aligned flag
> > > > >>> simultaneously.
> > > > >>> * On the offchance that that restriction is unacceptable, then
> > > > >>> we can  make things a little more complicated by doing a
> > > > >>> runtime computation of  the "index-shiftwidth" to use.
> > > > >>>
> > > > >>> Overall, I think defaulting to cacheline shiftwidth and
> > > > >>> disallowing index-based addressing when using unaligned
> > > > >>> buffers is simplest and easiest unless we can come up with a
> > > > >>> valid usecase for needing more than that.
> > > > >>>
> > > > >>> /Bruce
> > > > >>
> > > > >> This feature is a performance optimization.
> > > > >>
> > > > >> With that in mind, it should not introduce function pointers or
> > > > >> similar
> > run-time checks or in the fast path, to determine what kind of cache
> > to use per mempool. And if an index multiplier is implemented, it
> > should be a compile time constant, probably something between
> > sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> > > > >>
> > > > >> The patch comes with a tradeoff between better performance and
> > > > >> limited
> > mempool size, and possibly some limitations regarding very small
> > objects that are not cache line aligned to avoid wasting memory
> > (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> > > > >>
> > > > >> With no multiplier, the only tradeoff is that the mempool size
> > > > >> is
> > limited to 4 GB.
> > > > >>
> > > > >> If the multiplier is small (i.e. 8 bytes) the only tradeoff is
> > > > >> that the
> > mempool size is limited to 32 GB. (And a waste of memory for objects
> > smaller than 8 byte; but I don't think anyone would use a mempool to
> > hold objects smaller than 8 byte.)
> > > > >>
> > > > >> If the multiplier is larger (i.e. 64 bytes cache line size),
> > > > >> the
> > mempool size is instead limited to 256 GB, but
> > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> > > > >>
> > > > >> Note: 32 bit platforms have no benefit from this patch: The
> > > > >> pointer
> > already only uses 4 bytes, so replacing the pointer with a 4 byte
> > index makes no difference.
> > > > >>
> > > > >>
> > > > >> Since this feature is a performance optimization only, and
> > > > >> doesn't
> > provide any new features, I don't mind it being a compile time option.
> > > > >>
> > > > >> If this feature is a compile time option, and the mempool
> > > > >> library is
> > compiled with the large multiplier, then
> > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the
> public
> > header file, so compilation of applications using the flag will fail.
> > And rte_mempool_create() could
> > RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its
> > flags parameter, or emit a warning about the flag being ignored.
> > Obviously,
> > rte_mempool_create() should also RTE_ASSERT() that the mempool is not
> > larger than the library supports, possibly emitting a message that the
> > mempool library should be built without this feature to support the larger
> mempool.
> > > > >>
> > > > >> Here is another thought: If only exotic applications use
> > > > >> mempools
> > larger than 32 GB, this would be a generally acceptable limit, and
> > DPDK should use index-based cache as default, making the opposite
> > (i.e. pointer-based
> > cache) a compile time option instead. A similar decision was recently
> > made for limiting the RTE_MAX_LCORE default.
> > > > >>
> > > > >>
> > > > >> Although DPDK is moving away from compile time options in order
> > > > >> to
> > better support Linux distros, there should be a general exception for
> > performance and memory optimizations. Otherwise, network appliance
> > vendors will inherit the increasing amount of DPDK bloat, and we
> > (network appliance
> > vendors) will eventually be forced to fork DPDK to get rid of the
> > bloat and achieve the goals originally intended by DPDK.
> > > > >
> > > > > Agree with Morten's view on this.
> > > > >
> > > > >> If anyone disagrees with the principle about a general
> > > > >> exception for
> > performance and memory optimizations, I would like to pass on the
> > decision to the Techboard!
> > > > >>
> > >
> > > NAK
> > > Having compile time stuff like this means one side or the other is
> > > not
> > tested
> > > by CI infrastructure.  There never was sufficient justification, and
> > > lots of
> > objections.
> > > Dropping the patch.
> > >
> > >
> >
> >
> >
> >


^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2023-07-31 14:57 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-30 17:27 [dpdk-dev] [RFC] mempool: implement index-based per core cache Dharmik Thakkar
2021-10-01 12:36 ` Jerin Jacob
2021-10-01 15:44   ` Honnappa Nagarahalli
2021-10-01 17:32     ` Jerin Jacob
2021-10-01 17:57       ` Honnappa Nagarahalli
2021-10-01 18:21       ` Jerin Jacob
2021-10-01 21:30 ` Ananyev, Konstantin
2021-10-02  0:07   ` Honnappa Nagarahalli
2021-10-02 18:51     ` Ananyev, Konstantin
2021-10-04 16:36       ` Honnappa Nagarahalli
2021-10-30 10:23         ` Morten Brørup
2021-10-31  8:14         ` Morten Brørup
2021-11-03 15:12           ` Dharmik Thakkar
2021-11-03 15:52             ` Morten Brørup
2021-11-04  4:42               ` Dharmik Thakkar
2021-11-04  8:04                 ` Morten Brørup
2021-11-08  4:32                   ` Honnappa Nagarahalli
2021-11-08  7:22                     ` Morten Brørup
2021-11-08 15:29                       ` Honnappa Nagarahalli
2021-11-08 15:39                         ` Morten Brørup
2021-11-08 15:46                           ` Honnappa Nagarahalli
2021-11-08 16:03                             ` Morten Brørup
2021-11-08 16:47                               ` Jerin Jacob
2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
2021-12-24 22:59   ` [PATCH 1/1] " Dharmik Thakkar
2022-01-11  2:26     ` Ananyev, Konstantin
2022-01-13  5:17       ` Dharmik Thakkar
2022-01-13 10:37         ` Ananyev, Konstantin
2022-01-19 15:32           ` Dharmik Thakkar
2022-01-21 11:25             ` Ananyev, Konstantin
2022-01-21 11:31               ` Ananyev, Konstantin
2022-03-24 19:51               ` Dharmik Thakkar
2021-12-25  0:16   ` [PATCH 0/1] " Morten Brørup
2022-01-07 11:15     ` Bruce Richardson
2022-01-07 11:29       ` Morten Brørup
2022-01-07 13:50         ` Bruce Richardson
2022-01-08  9:37           ` Morten Brørup
2022-01-10  6:38             ` Jerin Jacob
2022-01-13  5:31               ` Dharmik Thakkar
2023-07-06 17:43                 ` Stephen Hemminger
2023-07-31 12:23                   ` Thomas Monjalon
2023-07-31 12:33                     ` Morten Brørup
2023-07-31 14:57                       ` Dharmik Jayesh Thakkar
2022-01-13  5:36   ` [PATCH v2 " Dharmik Thakkar
2022-01-13  5:36     ` [PATCH v2 1/1] " Dharmik Thakkar
2022-01-13 10:18       ` Jerin Jacob
2022-01-20  8:21       ` Morten Brørup
2022-01-21  6:01         ` Honnappa Nagarahalli
2022-01-21  7:36           ` Morten Brørup
2022-01-24 13:05             ` Ray Kinsella
2022-01-21  9:12           ` Bruce Richardson
2022-01-23  7:13       ` Wang, Haiyue

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).