From: Dharmik Thakkar <dharmik.thakkar@arm.com>
To: Olivier Matz <olivier.matz@6wind.com>,
	Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Cc: dev@dpdk.org, nd@arm.com, honnappa.nagarahalli@arm.com,
	ruifeng.wang@arm.com, Dharmik Thakkar <dharmik.thakkar@arm.com>
Subject: [dpdk-dev] [RFC] mempool: implement index-based per core cache
Date: Thu, 30 Sep 2021 12:27:35 -0500
Message-ID: <20210930172735.2675627-1-dharmik.thakkar@arm.com>

Current mempool per-core cache implementation stores pointers to the
cached objects. On most architectures, each pointer consumes 64 bits.
Replace it with an index-based implementation, in which each buffer is
addressed by (pool base address + index). This reduces the memory
footprint of the per-core cache.
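
As a minimal sketch of the conversion (illustrative only; the helper
names below are hypothetical and not part of this patch, and it
assumes every object lies within 4GB of the cache base address), using
the existing RTE_PTR_DIFF()/RTE_PTR_ADD() macros from rte_common.h:

	static inline uint32_t
	obj_to_index(void *base, void *obj)
	{
		/* A 32-bit offset from the base replaces a 64-bit pointer */
		return (uint32_t)RTE_PTR_DIFF(obj, base);
	}

	static inline void *
	index_to_obj(void *base, uint32_t index)
	{
		/* Recover the pointer: base + index */
		return RTE_PTR_ADD(base, index);
	}

With the default RTE_MEMPOOL_CACHE_MAX_SIZE of 512, this shrinks the
per-lcore cache array from 512 * 3 * 8 = 12288 bytes to
512 * 3 * 4 = 6144 bytes.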

L3Fwd performance testing shows minor improvements in cache
performance and no change in throughput.

Micro-benchmarking the patch with mempool_perf_test shows a
significant improvement in the majority of the test cases.

The future plan is to replace the global pool's pointer-based
implementation with an index-based implementation as well.
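
On Arm, the patch batches this conversion with NEON: two 64-bit
pointers are narrowed to two 32-bit offsets on put, and widened back
on get. Stripped of the surrounding cache logic, the idea reads as
the standalone sketch below (function names are hypothetical):

	#include <arm_neon.h>

	/* Compress two object pointers into two 32-bit offsets. */
	static inline void
	two_ptrs_to_indices(void *const *objs, uint32_t *idx, void *base)
	{
		uint64x2_t v_objs = vld1q_u64((const uint64_t *)objs);
		uint64x2_t v_base = vdupq_n_u64((uint64_t)base);
		/* Subtract the base, then narrow 64b -> 32b. */
		vst1_u32(idx, vqmovn_u64(vsubq_u64(v_objs, v_base)));
	}

	/* Expand two 32-bit offsets back into two object pointers. */
	static inline void
	two_indices_to_ptrs(const uint32_t *idx, void **objs, void *base)
	{
		/* Widen 32b -> 64b, then add the base back. */
		uint64x2_t v_idx = vmovl_u32(vld1_u32(idx));
		uint64x2_t v_base = vdupq_n_u64((uint64_t)base);
		vst1q_u64((uint64_t *)objs, vaddq_u64(v_idx, v_base));
	}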

Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 drivers/mempool/ring/rte_mempool_ring.c |  2 +-
 lib/mempool/rte_mempool.c               |  8 +++
 lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
index b1f09ff28f4d..e55913e47f21 100644
--- a/drivers/mempool/ring/rte_mempool_ring.c
+++ b/drivers/mempool/ring/rte_mempool_ring.c
@@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
 		return -rte_errno;
 
 	mp->pool_data = r;
-
+	mp->local_cache_base_addr = &r[1];
 	return 0;
 }
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 59a588425bd6..424bdb19c323 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	int ret;
 	bool need_iova_contig_obj;
 	size_t max_alloc_size = SIZE_MAX;
+	unsigned lcore_id;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		}
 	}
 
+	/* Init all default caches. */
+	if (mp->cache_size != 0) {
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			mp->local_cache[lcore_id].local_cache_base_value =
+				*(void **)mp->local_cache_base_addr;
+	}
+
 	rte_mempool_trace_populate_default(mp);
 	return mp->size;
 
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 4235d6f0bf2b..545405c0d3ce 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -51,6 +51,8 @@
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#include <rte_vect.h>
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -91,11 +93,12 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+	void *local_cache_base_value; /**< Base value to calculate indices */
 	/*
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 } __rte_cache_aligned;
 
 /**
@@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
  * A list of memory where objects are stored
  */
 STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
-
 /**
  * Callback used to free a memory chunk
  */
@@ -244,6 +246,7 @@ struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+	void *local_cache_base_addr; /**< Reference to the base value */
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
-	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = cache->local_cache_base_value;
+	uint32_t *cache_objs = cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
 	cache->len = 0;
 }
 
@@ -1289,7 +1300,9 @@ static __rte_always_inline void
 __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
 
 	/* increment stat now, adding in mempool always success */
 	__MEMPOOL_STAT_ADD(mp, put_bulk, 1);
@@ -1301,6 +1314,8 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 	cache_objs = &cache->objs[cache->len];
 
+	base_value = cache->local_cache_base_value;
+
 	/*
 	 * The cache follows the following algorithm
 	 *   1. Add the objects to the cache
@@ -1309,12 +1324,30 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
-	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
+	for (i = 0; i < (n & ~0x1); i+=2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
-		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
 				cache->len - cache->size);
 		cache->len = cache->size;
 	}
@@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
 	int ret;
+	uint32_t i;
 	uint32_t index, len;
-	void **cache_objs;
+	uint32_t *cache_objs;
 
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+	void *base_value = cache->local_cache_base_value;
 	cache_objs = cache->objs;
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
 		/* No. Backfill the cache first, and then fill from it */
 		uint32_t req = n + (cache->size - cache->len);
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /* Temporary object pointers */
 
 		/* How many do we require i.e. number to fill the cache + the request */
 		ret = rte_mempool_ops_dequeue_bulk(mp,
-			&cache->objs[cache->len], req);
+			temp_objs, req);
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
+		}
+
 		cache->len += req;
 	}
 
 	/* Now fill in the response ... */
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
+						len-=2, obj_table+=2) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+	}
+	if (n & 0x1)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
-		*obj_table = cache_objs[len];
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
 
 	cache->len -= n;
 
-- 
2.17.1


Thread overview: 48+ messages
2021-09-30 17:27 Dharmik Thakkar [this message]
2021-10-01 12:36 ` Jerin Jacob
2021-10-01 15:44   ` Honnappa Nagarahalli
2021-10-01 17:32     ` Jerin Jacob
2021-10-01 17:57       ` Honnappa Nagarahalli
2021-10-01 18:21       ` Jerin Jacob
2021-10-01 21:30 ` Ananyev, Konstantin
2021-10-02  0:07   ` Honnappa Nagarahalli
2021-10-02 18:51     ` Ananyev, Konstantin
2021-10-04 16:36       ` Honnappa Nagarahalli
2021-10-30 10:23         ` Morten Brørup
2021-10-31  8:14         ` Morten Brørup
2021-11-03 15:12           ` Dharmik Thakkar
2021-11-03 15:52             ` Morten Brørup
2021-11-04  4:42               ` Dharmik Thakkar
2021-11-04  8:04                 ` Morten Brørup
2021-11-08  4:32                   ` Honnappa Nagarahalli
2021-11-08  7:22                     ` Morten Brørup
2021-11-08 15:29                       ` Honnappa Nagarahalli
2021-11-08 15:39                         ` Morten Brørup
2021-11-08 15:46                           ` Honnappa Nagarahalli
2021-11-08 16:03                             ` Morten Brørup
2021-11-08 16:47                               ` Jerin Jacob
2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
2021-12-24 22:59   ` [PATCH 1/1] " Dharmik Thakkar
2022-01-11  2:26     ` Ananyev, Konstantin
2022-01-13  5:17       ` Dharmik Thakkar
2022-01-13 10:37         ` Ananyev, Konstantin
2022-01-19 15:32           ` Dharmik Thakkar
2022-01-21 11:25             ` Ananyev, Konstantin
2022-01-21 11:31               ` Ananyev, Konstantin
2022-03-24 19:51               ` Dharmik Thakkar
2021-12-25  0:16   ` [PATCH 0/1] " Morten Brørup
2022-01-07 11:15     ` Bruce Richardson
2022-01-07 11:29       ` Morten Brørup
2022-01-07 13:50         ` Bruce Richardson
2022-01-08  9:37           ` Morten Brørup
2022-01-10  6:38             ` Jerin Jacob
2022-01-13  5:31               ` Dharmik Thakkar
2022-01-13  5:36   ` [PATCH v2 " Dharmik Thakkar
2022-01-13  5:36     ` [PATCH v2 1/1] " Dharmik Thakkar
2022-01-13 10:18       ` Jerin Jacob
2022-01-20  8:21       ` Morten Brørup
2022-01-21  6:01         ` Honnappa Nagarahalli
2022-01-21  7:36           ` Morten Brørup
2022-01-24 13:05             ` Ray Kinsella
2022-01-21  9:12           ` Bruce Richardson
2022-01-23  7:13       ` Wang, Haiyue
