From: Dharmik Thakkar <dharmik.thakkar@arm.com>
To: Olivier Matz <olivier.matz@6wind.com>,
Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Cc: dev@dpdk.org, nd@arm.com, honnappa.nagarahalli@arm.com,
ruifeng.wang@arm.com, Dharmik Thakkar <dharmik.thakkar@arm.com>
Subject: [PATCH 1/1] mempool: implement index-based per core cache
Date: Fri, 24 Dec 2021 16:59:23 -0600 [thread overview]
Message-ID: <20211224225923.806498-2-dharmik.thakkar@arm.com> (raw)
In-Reply-To: <20211224225923.806498-1-dharmik.thakkar@arm.com>
Current mempool per-core cache implementation stores pointers to mbufs.
On 64-bit architectures, each pointer consumes 8 bytes.
This patch replaces it with an index-based implementation,
wherein each buffer is addressed by (pool base address + index).
It reduces the amount of memory/cache required for the per-core cache.
L3Fwd performance testing reveals minor improvements in the cache
performance (L1 and L2 misses reduced by 0.60%),
with no change in throughput.
Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
lib/mempool/rte_mempool.h | 114 +++++++++++++++++++++++++-
lib/mempool/rte_mempool_ops_default.c | 7 ++
2 files changed, 119 insertions(+), 2 deletions(-)
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1e7a3c15273c..4fabd3b1920b 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -50,6 +50,10 @@
#include <rte_memcpy.h>
#include <rte_common.h>
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#include <rte_vect.h>
+#endif
+
#include "rte_mempool_trace_fp.h"
#ifdef __cplusplus
@@ -239,6 +243,9 @@ struct rte_mempool {
int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ void *pool_base_value; /**< Base value to calculate indices */
+#endif
uint32_t populated_size; /**< Number of populated objects. */
struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
if (cache == NULL || cache->len == 0)
return;
rte_mempool_trace_cache_flush(cache, mp);
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ unsigned int i;
+ unsigned int cache_len = cache->len;
+ void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+ void *base_value = mp->pool_base_value;
+ uint32_t *cache_objs = (uint32_t *) cache->objs;
+ for (i = 0; i < cache_len; i++)
+ obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
+#else
rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+#endif
cache->len = 0;
}
@@ -1334,8 +1353,13 @@ static __rte_always_inline void
rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
unsigned int n, struct rte_mempool_cache *cache)
{
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ uint32_t *cache_objs;
+ void *base_value;
+ uint32_t i;
+#else
void **cache_objs;
-
+#endif
/* increment stat now, adding in mempool always success */
RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
@@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
goto ring_enqueue;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ cache_objs = (uint32_t *) cache->objs;
+ cache_objs = &cache_objs[cache->len];
+ base_value = mp->pool_base_value;
+#else
cache_objs = &cache->objs[cache->len];
+#endif
/*
* The cache follows the following algorithm
@@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
*/
/* Add elements back into the cache */
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+ uint64x2_t v_obj_table;
+ uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+ uint32x2_t v_cache_objs;
+
+ for (i = 0; i < (n & ~0x1); i += 2) {
+ v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+ v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+ vst1_u32(cache_objs + i, v_cache_objs);
+ }
+ if (n & 0x1) {
+ cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+ }
+#else
+ for (i = 0; i < n; i++) {
+ cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+ }
+#endif
+#else
rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+#endif
cache->len += n;
if (cache->len >= cache->flushthresh) {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
+ cache->len - cache->size);
+#else
rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
cache->len - cache->size);
+#endif
cache->len = cache->size;
}
@@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
{
int ret;
uint32_t index, len;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ uint32_t i;
+ uint32_t *cache_objs;
+#else
void **cache_objs;
-
+#endif
/* No cache provided or cannot be satisfied from cache */
if (unlikely(cache == NULL || n >= cache->size))
goto ring_dequeue;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ void *base_value = mp->pool_base_value;
+ cache_objs = (uint32_t *) cache->objs;
+#else
cache_objs = cache->objs;
+#endif
/* Can this be satisfied from the cache? */
if (cache->len < n) {
@@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache->size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ temp_objs, req);
+#else
ret = rte_mempool_ops_dequeue_bulk(mp,
&cache->objs[cache->len], req);
+#endif
if (unlikely(ret < 0)) {
/*
* In the off chance that we are buffer constrained,
@@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
goto ring_dequeue;
}
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ len = cache->len;
+ for (i = 0; i < req; ++i, ++len) {
+ cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
+ base_value);
+ }
+#endif
cache->len += req;
}
/* Now fill in the response ... */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+ uint64x2_t v_obj_table;
+ uint64x2_t v_cache_objs;
+ uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+ for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
+ len -= 4, obj_table += 4) {
+ v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+ v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+ vst1q_u64((uint64_t *)obj_table, v_obj_table);
+ v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
+ v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+ vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
+ }
+ switch (n & 0x3) {
+ case 3:
+ *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+ /* fallthrough */
+ case 2:
+ *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+ /* fallthrough */
+ case 1:
+ *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+ }
+#else
+ for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
+ *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
+#else
for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
*obj_table = cache_objs[len];
+#endif
cache->len -= n;
diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
index 22fccf9d7619..3543cad9d4ce 100644
--- a/lib/mempool/rte_mempool_ops_default.c
+++ b/lib/mempool/rte_mempool_ops_default.c
@@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
obj = va + off;
obj_cb(mp, obj_cb_arg, obj,
(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ /* Store pool base value to calculate indices for index-based
+ * lcore cache implementation
+ */
+ if (i == 0)
+ mp->pool_base_value = obj;
+#endif
rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
off += mp->elt_size + mp->trailer_size;
}
--
2.25.1
next prev parent reply other threads:[~2021-12-24 23:00 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-30 17:27 [dpdk-dev] [RFC] " Dharmik Thakkar
2021-10-01 12:36 ` Jerin Jacob
2021-10-01 15:44 ` Honnappa Nagarahalli
2021-10-01 17:32 ` Jerin Jacob
2021-10-01 17:57 ` Honnappa Nagarahalli
2021-10-01 18:21 ` Jerin Jacob
2021-10-01 21:30 ` Ananyev, Konstantin
2021-10-02 0:07 ` Honnappa Nagarahalli
2021-10-02 18:51 ` Ananyev, Konstantin
2021-10-04 16:36 ` Honnappa Nagarahalli
2021-10-30 10:23 ` Morten Brørup
2021-10-31 8:14 ` Morten Brørup
2021-11-03 15:12 ` Dharmik Thakkar
2021-11-03 15:52 ` Morten Brørup
2021-11-04 4:42 ` Dharmik Thakkar
2021-11-04 8:04 ` Morten Brørup
2021-11-08 4:32 ` Honnappa Nagarahalli
2021-11-08 7:22 ` Morten Brørup
2021-11-08 15:29 ` Honnappa Nagarahalli
2021-11-08 15:39 ` Morten Brørup
2021-11-08 15:46 ` Honnappa Nagarahalli
2021-11-08 16:03 ` Morten Brørup
2021-11-08 16:47 ` Jerin Jacob
2021-12-24 22:59 ` [PATCH 0/1] " Dharmik Thakkar
2021-12-24 22:59 ` Dharmik Thakkar [this message]
2022-01-11 2:26 ` [PATCH 1/1] " Ananyev, Konstantin
2022-01-13 5:17 ` Dharmik Thakkar
2022-01-13 10:37 ` Ananyev, Konstantin
2022-01-19 15:32 ` Dharmik Thakkar
2022-01-21 11:25 ` Ananyev, Konstantin
2022-01-21 11:31 ` Ananyev, Konstantin
2022-03-24 19:51 ` Dharmik Thakkar
2021-12-25 0:16 ` [PATCH 0/1] " Morten Brørup
2022-01-07 11:15 ` Bruce Richardson
2022-01-07 11:29 ` Morten Brørup
2022-01-07 13:50 ` Bruce Richardson
2022-01-08 9:37 ` Morten Brørup
2022-01-10 6:38 ` Jerin Jacob
2022-01-13 5:31 ` Dharmik Thakkar
2023-07-06 17:43 ` Stephen Hemminger
2023-07-31 12:23 ` Thomas Monjalon
2023-07-31 12:33 ` Morten Brørup
2023-07-31 14:57 ` Dharmik Jayesh Thakkar
2022-01-13 5:36 ` [PATCH v2 " Dharmik Thakkar
2022-01-13 5:36 ` [PATCH v2 1/1] " Dharmik Thakkar
2022-01-13 10:18 ` Jerin Jacob
2022-01-20 8:21 ` Morten Brørup
2022-01-21 6:01 ` Honnappa Nagarahalli
2022-01-21 7:36 ` Morten Brørup
2022-01-24 13:05 ` Ray Kinsella
2022-01-21 9:12 ` Bruce Richardson
2022-01-23 7:13 ` Wang, Haiyue
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211224225923.806498-2-dharmik.thakkar@arm.com \
--to=dharmik.thakkar@arm.com \
--cc=andrew.rybchenko@oktetlabs.ru \
--cc=dev@dpdk.org \
--cc=honnappa.nagarahalli@arm.com \
--cc=nd@arm.com \
--cc=olivier.matz@6wind.com \
--cc=ruifeng.wang@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).