DPDK patches and discussions
 help / color / mirror / Atom feed
* [RFC PATCH] mempool: obey configured cache size
@ 2024-09-20 16:32 Morten Brørup
  2024-09-20 16:37 ` [RFC PATCH v2] " Morten Brørup
                   ` (10 more replies)
  0 siblings, 11 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-20 16:32 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

Seeking feedback on the concept.
I have not yet benchmarked performance.

The mempool cache size is configurable, but it could hold 1.5 times the
configured size.
This was confusing for developers, and added complexity when configuring
mempools with caches.

This patch modifies the mempool cache to obey the configured size, and
removes the cache flush threshold.

Furthermore, the mempool caches are now completely flushed/filled to/from
the backend, so backend accesses are CPU cache line aligned.

Finally, the mempool get and put functions are optimized to only
inline the likely scenarios, and call a non-inline static helper function
in other cases.

Various drivers accessing the mempool directly have been updated
accordingly.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 app/test/test_mempool_perf.c                  | 146 +++++---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 337 +++++++++++-------
 10 files changed, 353 insertions(+), 298 deletions(-)

diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index 55e17cce47..4dd74ef75a 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -54,22 +54,25 @@
  *
  *    - Bulk size (*n_get_bulk*, *n_put_bulk*)
  *
- *      - Bulk get from 1 to 32
- *      - Bulk put from 1 to 32
- *      - Bulk get and put from 1 to 32, compile time constant
+ *      - Bulk get from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
+ *      - Bulk put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
+ *      - Bulk get and put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE, compile time constant
  *
  *    - Number of kept objects (*n_keep*)
  *
  *      - 32
  *      - 128
  *      - 512
+ *      - 2048
+ *      - 8192
+ *      - 32768
  */
 
-#define N 65536
-#define TIME_S 5
+#define TIME_S 1
 #define MEMPOOL_ELT_SIZE 2048
-#define MAX_KEEP 512
-#define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
+#define MAX_KEEP 32768
+#define N (128 * MAX_KEEP)
+#define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE*2))-1)
 
 /* Number of pointers fitting into one cache line. */
 #define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))
@@ -100,9 +103,11 @@ static unsigned n_keep;
 /* true if we want to test with constant n_get_bulk and n_put_bulk */
 static int use_constant_values;
 
-/* number of enqueues / dequeues */
+/* number of enqueues / dequeues, and time used */
 struct __rte_cache_aligned mempool_test_stats {
 	uint64_t enq_count;
+	uint64_t duration_cycles;
+	RTE_CACHE_GUARD;
 };
 
 static struct mempool_test_stats stats[RTE_MAX_LCORE];
@@ -185,6 +190,7 @@ per_lcore_mempool_test(void *arg)
 		GOTO_ERR(ret, out);
 
 	stats[lcore_id].enq_count = 0;
+	stats[lcore_id].duration_cycles = 0;
 
 	/* wait synchro for workers */
 	if (lcore_id != rte_get_main_lcore())
@@ -205,6 +211,15 @@ per_lcore_mempool_test(void *arg)
 					CACHE_LINE_BURST, CACHE_LINE_BURST);
 		else if (n_get_bulk == 32)
 			ret = test_loop(mp, cache, n_keep, 32, 32);
+		else if (n_get_bulk == 64)
+			ret = test_loop(mp, cache, n_keep, 64, 64);
+		else if (n_get_bulk == 128)
+			ret = test_loop(mp, cache, n_keep, 128, 128);
+		else if (n_get_bulk == 256)
+			ret = test_loop(mp, cache, n_keep, 256, 256);
+		else if (n_get_bulk == RTE_MEMPOOL_CACHE_MAX_SIZE)
+			ret = test_loop(mp, cache, n_keep,
+					RTE_MEMPOOL_CACHE_MAX_SIZE, RTE_MEMPOOL_CACHE_MAX_SIZE);
 		else
 			ret = -1;
 
@@ -216,6 +231,8 @@ per_lcore_mempool_test(void *arg)
 		stats[lcore_id].enq_count += N;
 	}
 
+	stats[lcore_id].duration_cycles = time_diff;
+
 out:
 	if (use_external_cache) {
 		rte_mempool_cache_flush(cache, mp);
@@ -233,6 +250,7 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 	uint64_t rate;
 	int ret;
 	unsigned cores_save = cores;
+	double hz = rte_get_timer_hz();
 
 	rte_atomic_store_explicit(&synchro, 0, rte_memory_order_relaxed);
 
@@ -279,7 +297,9 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 
 	rate = 0;
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
-		rate += (stats[lcore_id].enq_count / TIME_S);
+		if (stats[lcore_id].duration_cycles != 0)
+			rate += (double)stats[lcore_id].enq_count * hz /
+					(double)stats[lcore_id].duration_cycles;
 
 	printf("rate_persec=%" PRIu64 "\n", rate);
 
@@ -288,11 +308,13 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 
 /* for a given number of core, launch all test cases */
 static int
-do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
+do_one_mempool_test(struct rte_mempool *mp, unsigned int cores, int external_cache)
 {
-	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
-	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
-	unsigned int keep_tab[] = { 32, 128, 512, 0 };
+	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
+			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
+	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
+			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
+	unsigned int keep_tab[] = { 32, 128, 512, 2048, 8192, 32768, 0 };
 	unsigned *get_bulk_ptr;
 	unsigned *put_bulk_ptr;
 	unsigned *keep_ptr;
@@ -302,6 +324,10 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
 			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
 
+				if (*keep_ptr < *get_bulk_ptr || *keep_ptr < *put_bulk_ptr)
+					continue;
+
+				use_external_cache = external_cache;
 				use_constant_values = 0;
 				n_get_bulk = *get_bulk_ptr;
 				n_put_bulk = *put_bulk_ptr;
@@ -324,7 +350,7 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 }
 
 static int
-test_mempool_perf(void)
+do_all_mempool_perf_tests(unsigned int cores)
 {
 	struct rte_mempool *mp_cache = NULL;
 	struct rte_mempool *mp_nocache = NULL;
@@ -338,8 +364,10 @@ test_mempool_perf(void)
 					NULL, NULL,
 					my_obj_init, NULL,
 					SOCKET_ID_ANY, 0);
-	if (mp_nocache == NULL)
+	if (mp_nocache == NULL) {
+		printf("cannot allocate mempool (without cache)\n");
 		goto err;
+	}
 
 	/* create a mempool (with cache) */
 	mp_cache = rte_mempool_create("perf_test_cache", MEMPOOL_SIZE,
@@ -348,8 +376,10 @@ test_mempool_perf(void)
 				      NULL, NULL,
 				      my_obj_init, NULL,
 				      SOCKET_ID_ANY, 0);
-	if (mp_cache == NULL)
+	if (mp_cache == NULL) {
+		printf("cannot allocate mempool (with cache)\n");
 		goto err;
+	}
 
 	default_pool_ops = rte_mbuf_best_mempool_ops();
 	/* Create a mempool based on Default handler */
@@ -377,65 +407,83 @@ test_mempool_perf(void)
 
 	rte_mempool_obj_iter(default_pool, my_obj_init, NULL);
 
-	/* performance test with 1, 2 and max cores */
 	printf("start performance test (without cache)\n");
-
-	if (do_one_mempool_test(mp_nocache, 1) < 0)
-		goto err;
-
-	if (do_one_mempool_test(mp_nocache, 2) < 0)
+	if (do_one_mempool_test(mp_nocache, cores, 0) < 0)
 		goto err;
 
-	if (do_one_mempool_test(mp_nocache, rte_lcore_count()) < 0)
-		goto err;
-
-	/* performance test with 1, 2 and max cores */
 	printf("start performance test for %s (without cache)\n",
 	       default_pool_ops);
-
-	if (do_one_mempool_test(default_pool, 1) < 0)
+	if (do_one_mempool_test(default_pool, cores, 0) < 0)
 		goto err;
 
-	if (do_one_mempool_test(default_pool, 2) < 0)
+	printf("start performance test (with cache)\n");
+	if (do_one_mempool_test(mp_cache, cores, 0) < 0)
 		goto err;
 
-	if (do_one_mempool_test(default_pool, rte_lcore_count()) < 0)
+	printf("start performance test (with user-owned cache)\n");
+	if (do_one_mempool_test(mp_nocache, cores, 1) < 0)
 		goto err;
 
-	/* performance test with 1, 2 and max cores */
-	printf("start performance test (with cache)\n");
+	rte_mempool_list_dump(stdout);
 
-	if (do_one_mempool_test(mp_cache, 1) < 0)
-		goto err;
+	ret = 0;
 
-	if (do_one_mempool_test(mp_cache, 2) < 0)
-		goto err;
+err:
+	rte_mempool_free(mp_cache);
+	rte_mempool_free(mp_nocache);
+	rte_mempool_free(default_pool);
+	return ret;
+}
 
-	if (do_one_mempool_test(mp_cache, rte_lcore_count()) < 0)
-		goto err;
+static int
+test_mempool_perf_1core(void)
+{
+	return do_all_mempool_perf_tests(1);
+}
 
-	/* performance test with 1, 2 and max cores */
-	printf("start performance test (with user-owned cache)\n");
-	use_external_cache = 1;
+static int
+test_mempool_perf_2cores(void)
+{
+	if (rte_lcore_count() < 2) {
+		printf("not enough lcores\n");
+		return -1;
+	}
+	return do_all_mempool_perf_tests(2);
+}
 
-	if (do_one_mempool_test(mp_nocache, 1) < 0)
-		goto err;
+static int
+test_mempool_perf_allcores(void)
+{
+	return do_all_mempool_perf_tests(rte_lcore_count());
+}
+
+static int
+test_mempool_perf(void)
+{
+	int ret = -1;
 
-	if (do_one_mempool_test(mp_nocache, 2) < 0)
+	/* performance test with 1, 2 and max cores */
+	if (do_all_mempool_perf_tests(1) < 0)
 		goto err;
+	if (rte_lcore_count() == 1)
+		goto done;
 
-	if (do_one_mempool_test(mp_nocache, rte_lcore_count()) < 0)
+	if (do_all_mempool_perf_tests(2) < 0)
 		goto err;
+	if (rte_lcore_count() == 2)
+		goto done;
 
-	rte_mempool_list_dump(stdout);
+	if (do_all_mempool_perf_tests(rte_lcore_count()) < 0)
+		goto err;
 
+done:
 	ret = 0;
 
 err:
-	rte_mempool_free(mp_cache);
-	rte_mempool_free(mp_nocache);
-	rte_mempool_free(default_pool);
 	return ret;
 }
 
 REGISTER_PERF_TEST(mempool_perf_autotest, test_mempool_perf);
+REGISTER_PERF_TEST(mempool_perf_autotest_1core, test_mempool_perf_1core);
+REGISTER_PERF_TEST(mempool_perf_autotest_2cores, test_mempool_perf_2cores);
+REGISTER_PERF_TEST(mempool_perf_autotest_allcores, test_mempool_perf_allcores);
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that cache have enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..de1d49bac3 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1363,7 +1361,8 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 }
 
 /**
- * @internal Put several objects back in the mempool; used internally.
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
@@ -1371,58 +1370,94 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  * @param n
  *   The number of objects to store back in the mempool, must be strictly
  *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  */
-static __rte_always_inline void
-rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static __rte_noinline void
+rte_mempool_do_generic_put_many(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
+	__attribute__((assume(cache != NULL)));
+	__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(cache->len + n > cache->size)));
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
-		goto driver_enqueue;
+	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stat now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	if (unlikely(n > cache_size)) {
+		/* Push following objects, in entire cache sizes, directly to the backend. */
+		len = n - n % cache_size;
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, len);
+		obj_table += len;
+		n -= len;
 	}
 
-	/* Add the objects to the cache. */
+	/* Add the remaining objects to the cache. */
+	cache->len = n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
 
-	return;
-
-driver_enqueue:
-
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
-
-driver_enqueue_stats_incremented:
+/**
+ * @internal Put several objects back in the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ */
+static __rte_always_inline void
+rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+		__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+
+		/* Enough remaining space in the cache? */
+		if (likely(cache->len + n <= cache->size)) {
+			void **cache_objs;
+
+			/* Increment stat now, adding in mempool always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
+			/* Add the objects to the cache. */
+			cache_objs = &cache->objs[cache->len];
+			cache->len += n;
+			rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		} else
+			rte_mempool_do_generic_put_many(mp, obj_table, n, cache);
+	} else {
+		/* Increment stat now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
-	/* push objects to the backend */
-	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+		/* push objects to the backend */
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+	}
 }
 
 
@@ -1490,135 +1525,193 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static __rte_noinline int
+rte_mempool_do_generic_get_many(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
+	__attribute__((assume(cache != NULL)));
+	__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(n > cache->len)));
+
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
-	}
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
-	for (index = 0; index < remaining; index++)
-		*obj_table++ = *--cache_objs;
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	cache->len = cache->size;
+			return 0;
+		}
 
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
+	}
+
+	/* Increment stat now, this always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
+	for (index = 0; index < remaining; index++)
+		*obj_table++ = *--cache_objs;
+
+	cache->len = cache_size - remaining;
+
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+		__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+
+		/* Enough objects in the cache? */
+		if (n <= cache->len) {
+			unsigned int index;
+			void **cache_objs;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+			/* Increment stat now, this always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
 			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
 			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_many(mp, obj_table, n, cache);
 	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v2] mempool: obey configured cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
@ 2024-09-20 16:37 ` Morten Brørup
  2024-09-20 17:13 ` [RFC PATCH v3] " Morten Brørup
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-20 16:37 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

Seeking feedback on the concept.
I have not yet benchmarked performance.

The mempool cache size is configurable, but it could hold 1.5 times the
configured size.
This was confusing for developers, and added complexity when configuring
mempools with caches.

This patch modifies the mempool cache to obey the configured size, and
removes the cache flush threshold.

Furthermore, the mempool caches are now completely flushed/filled to/from
the backend, so backend accesses are CPU cache line aligned.

Finally, the mempool get and put functions are optimized to only
inline the likely scenarios, and call a non-inline static helper function
in other cases.

Various drivers accessing the mempool directly have been updated
accordingly.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 337 +++++++++++-------
 9 files changed, 256 insertions(+), 249 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that the cache has enough space for its configured size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..de1d49bac3 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1363,7 +1361,8 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 }
 
 /**
- * @internal Put several objects back in the mempool; used internally.
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
@@ -1371,58 +1370,94 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  * @param n
  *   The number of objects to store back in the mempool, must be strictly
  *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  */
-static __rte_always_inline void
-rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static __rte_noinline void
+rte_mempool_do_generic_put_many(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
+	__attribute__((assume(cache != NULL)));
+	__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(cache->len + n > cache->size)));
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
-		goto driver_enqueue;
+	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stat now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	if (unlikely(n > cache_size)) {
+		/* Push following objects, in entire cache sizes, directly to the backend. */
+		len = n - n % cache_size;
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, len);
+		obj_table += len;
+		n -= len;
 	}
 
-	/* Add the objects to the cache. */
+	/* Add the remaining objects to the cache. */
+	cache->len = n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
 
-	return;
-
-driver_enqueue:
-
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
-
-driver_enqueue_stats_incremented:
+/**
+ * @internal Put several objects back in the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ */
+static __rte_always_inline void
+rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+		__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+
+		/* Enough remaining space in the cache? */
+		if (likely(cache->len + n <= cache->size)) {
+			void **cache_objs;
+
+			/* Increment stat now, adding in mempool always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
+			/* Add the objects to the cache. */
+			cache_objs = &cache->objs[cache->len];
+			cache->len += n;
+			rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		} else
+			rte_mempool_do_generic_put_many(mp, obj_table, n, cache);
+	} else {
+		/* Increment stat now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
-	/* push objects to the backend */
-	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+		/* push objects to the backend */
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+	}
 }
 
 
@@ -1490,135 +1525,193 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static __rte_noinline int
+rte_mempool_do_generic_get_many(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
+	__attribute__((assume(cache != NULL)));
+	__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+	__attribute__((assume(n > cache->len)));
+
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
-	}
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
-	for (index = 0; index < remaining; index++)
-		*obj_table++ = *--cache_objs;
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	cache->len = cache->size;
+			return 0;
+		}
 
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
+	}
+
+	/* Increment stat now, this always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
+	for (index = 0; index < remaining; index++)
+		*obj_table++ = *--cache_objs;
+
+	cache->len = cache_size - remaining;
+
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		__attribute__((assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+		__attribute__((assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE)));
+
+		/* Enough objects in the cache? */
+		if (n <= cache->len) {
+			unsigned int index;
+			void **cache_objs;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+			/* Increment stat now, this always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
 			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
 			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_many(mp, obj_table, n, cache);
 	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v3] mempool: obey configured cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
  2024-09-20 16:37 ` [RFC PATCH v2] " Morten Brørup
@ 2024-09-20 17:13 ` Morten Brørup
  2024-09-20 19:41   ` Mattias Rönnblom
  2024-09-22 10:50 ` [RFC PATCH v4] mempool: fix mempool " Morten Brørup
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2024-09-20 17:13 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

Seeking feedback on the concept.
I have not yet benchmarked performance.

The mempool cache size is configurable, but it could hold 1.5 times the
configured size.
This was confusing for developers, and added complexity when configuring
mempools with caches.

This patch modifies the mempool cache to obey the configured size, and
removes the cache flush threshold.

Furthermore, the mempool caches are now completely flushed/filled to/from
the backend, so backend accesses are CPU cache line aligned.

Finally, the mempool get and put functions are optimized to only
inline the likely scenarios, and call a non-inline static helper function
in other cases.

Various drivers accessing the mempool directly have been updated
accordingly.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 321 +++++++++++-------
 9 files changed, 240 insertions(+), 249 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that the cache has enough space for its size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..580c655eb3 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1363,7 +1361,8 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 }
 
 /**
- * @internal Put several objects back in the mempool; used internally.
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
@@ -1371,58 +1370,86 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  * @param n
  *   The number of objects to store back in the mempool, must be strictly
  *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  */
-static __rte_always_inline void
-rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static __rte_noinline void
+rte_mempool_do_generic_put_many(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
 	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
-		goto driver_enqueue;
-
-	/* increment stat now, adding in mempool always success */
+	/* Increment stat now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	if (unlikely(n > cache_size)) {
+		/* Push following objects, in entire cache sizes, directly to the backend. */
+		len = n - n % cache_size;
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, len);
+		obj_table += len;
+		n -= len;
 	}
 
-	/* Add the objects to the cache. */
+	/* Add the remaining objects to the cache. */
+	cache->len = n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
 
-	return;
-
-driver_enqueue:
-
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
-
-driver_enqueue_stats_incremented:
+/**
+ * @internal Put several objects back in the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ */
+static __rte_always_inline void
+rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		/* Enough remaining space in the cache? */
+		if (likely(cache->len + n <= cache->size)) {
+			void **cache_objs;
+
+			/* Increment stat now, adding in mempool always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
+			/* Add the objects to the cache. */
+			cache_objs = &cache->objs[cache->len];
+			cache->len += n;
+			rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		} else
+			rte_mempool_do_generic_put_many(mp, obj_table, n, cache);
+	} else {
+		/* Increment stat now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
-	/* push objects to the backend */
-	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+		/* push objects to the backend */
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+	}
 }
 
 
@@ -1490,135 +1517,185 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static __rte_noinline int
+rte_mempool_do_generic_get_many(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
-	}
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
-	for (index = 0; index < remaining; index++)
-		*obj_table++ = *--cache_objs;
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
 
-	cache->len = cache->size;
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
+	}
 
+	/* Increment stat now, this always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
+	for (index = 0; index < remaining; index++)
+		*obj_table++ = *--cache_objs;
+
+	cache->len = cache_size - remaining;
+
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		/* Enough objects in the cache? */
+		if (n <= cache->len) {
+			unsigned int index;
+			void **cache_objs;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+			/* Increment stat now, this always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
 			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
 			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_many(mp, obj_table, n, cache);
 	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH v3] mempool: obey configured cache size
  2024-09-20 17:13 ` [RFC PATCH v3] " Morten Brørup
@ 2024-09-20 19:41   ` Mattias Rönnblom
  0 siblings, 0 replies; 14+ messages in thread
From: Mattias Rönnblom @ 2024-09-20 19:41 UTC (permalink / raw)
  To: Morten Brørup, dev

On 2024-09-20 19:13, Morten Brørup wrote:
> Seeking feedback on the concept.
> I have not yet benchmarked performance.
> 
> The mempool cache size is configurable, but it could hold 1.5 times the
> configured size.
> This was confusing for developers, and added complexity when configuring
> mempools with caches.
> 

Is there an upside to the current semantics? Are non-power-2 
rings/stacks more costly to operate against?

> This patch modifies the mempool cache to obey the configured size, and
> removes the cache flush threshold.
> 

Maybe it would be worth mentioning what was the original purpose of the 
threshold (if known), and why it's no longer a good idea (if it ever was).

> Furthermore, the mempool caches are now completely flushed/filled to/from
> the backend, so backend accesses are CPU cache line aligned.
> 
> Finallly, the mempool get and put functions are optimized to only
> inline the likely scenarios, and call a non-inline static helper function
> in other cases.

What do you mean by "inline" here? Not in the header file, not marked 
"inline", or something else. (I've read the code so I know what you 
mean, but it should preferably be clear already in the commit message.)

Being in the header file does not force the compiler to inline functions, 
and being in a .c file does not prevent inlining, in case LTO is used.

> 
> Various drivers accessing the mempool directly have been updated
> accordingly.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v3:
> * Removed __attribute__(assume).
> v2:
> * Removed mempool perf test; not part of patch set.
> ---
>   drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +--
>   drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
>   drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
>   drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
>   drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
>   drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
>   lib/mempool/mempool_trace.h                   |   1 -
>   lib/mempool/rte_mempool.c                     |  12 +-
>   lib/mempool/rte_mempool.h                     | 321 +++++++++++-------
>   9 files changed, 240 insertions(+), 249 deletions(-)
> 
> diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
> index 3b5e124ec8..98535a48f3 100644
> --- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
> +++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
> @@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
>   								rte_lcore_id());
>   		void **cache_objs;
>   
> -		if (cache == NULL || cache->len == 0)
> -			goto normal;
> -
> -		cache_objs = &cache->objs[cache->len];
> -
> -		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +		if (!cache || unlikely(n + cache->len > cache->size)) {
> +			rte_mempool_generic_put(mp, (void *)txep, n, cache);
>   			goto done;
>   		}
>   
> -		/* The cache follows the following algorithm
> -		 *   1. Add the objects to the cache
> -		 *   2. Anything greater than the cache min value (if it crosses the
> -		 *   cache flush threshold) is flushed to the ring.
> -		 */
> +		cache_objs = &cache->objs[cache->len];
> +
>   		/* Add elements back into the cache */
>   		uint32_t copied = 0;
>   		/* n is multiple of 32 */
> @@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
>   		}
>   		cache->len += n;
>   
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk(mp,
> -						     &cache->objs[cache->size],
> -						     cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
> +		/* Increment stat. */
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>   		goto done;
>   	}
>   
> -normal:
>   	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>   	if (likely(m != NULL)) {
>   		free[0] = m;
> @@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
>   								rte_lcore_id());
>   		void **cache_objs;
>   
> -		if (!cache || cache->len == 0)
> -			goto normal;
> -
> -		cache_objs = &cache->objs[cache->len];
> -
> -		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +		if (!cache || unlikely(n + cache->len > cache->size)) {
> +			rte_mempool_generic_put(mp, (void *)txep, n, cache);
>   			goto done;
>   		}
>   
> -		/* The cache follows the following algorithm
> -		 *   1. Add the objects to the cache
> -		 *   2. Anything greater than the cache min value (if it crosses the
> -		 *   cache flush threshold) is flushed to the ring.
> -		 */
> +		cache_objs = &cache->objs[cache->len];
> +
>   		/* Add elements back into the cache */
>   		uint32_t copied = 0;
>   		/* n is multiple of 32 */
> @@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
>   		}
>   		cache->len += n;
>   
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk(mp,
> -						     &cache->objs[cache->size],
> -						     cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
> +		/* Increment stat. */
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>   		goto done;
>   	}
>   
> -normal:
>   	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>   	if (likely(m)) {
>   		free[0] = m;
> diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
> index 74bfcab509..3a936826c8 100644
> --- a/drivers/mempool/dpaa/dpaa_mempool.c
> +++ b/drivers/mempool/dpaa/dpaa_mempool.c
> @@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
>   	struct bman_pool_params params = {
>   		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
>   	};
> -	unsigned int lcore_id;
> -	struct rte_mempool_cache *cache;
>   
>   	MEMPOOL_INIT_FUNC_TRACE();
>   
> @@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
>   	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
>   		   sizeof(struct dpaa_bp_info));
>   	mp->pool_data = (void *)bp_info;
> -	/* Update per core mempool cache threshold to optimal value which is
> -	 * number of buffers that can be released to HW buffer pool in
> -	 * a single API call.
> -	 */
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> -		cache = &mp->local_cache[lcore_id];
> -		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
> -			lcore_id, cache->flushthresh,
> -			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
> -		if (cache->flushthresh)
> -			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
> -	}
>   
>   	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
>   	return 0;
> @@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
>   	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
>   			     count, bp_info->bpid);
>   
> -	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
> +	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
>   		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
>   				 count);
>   		return -1;
> diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> index 42e17d984c..a44f3cf616 100644
> --- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> +++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> @@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
>   	struct dpaa2_bp_info *bp_info;
>   	struct dpbp_attr dpbp_attr;
>   	uint32_t bpid;
> -	unsigned int lcore_id;
> -	struct rte_mempool_cache *cache;
>   	int ret;
>   
>   	avail_dpbp = dpaa2_alloc_dpbp_dev();
> @@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
>   	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
>   
>   	h_bp_list = bp_list;
> -	/* Update per core mempool cache threshold to optimal value which is
> -	 * number of buffers that can be released to HW buffer pool in
> -	 * a single API call.
> -	 */
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> -		cache = &mp->local_cache[lcore_id];
> -		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
> -			lcore_id, cache->flushthresh,
> -			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
> -		if (cache->flushthresh)
> -			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
> -	}
>   
>   	return 0;
>   err3:
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 0238b03f8a..712ab1726f 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> @@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>   		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
>   				rte_lcore_id());
>   
> -		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> +		if (!cache || unlikely(n + cache->len > cache->size)) {
>   			rte_mempool_generic_put(mp, (void *)txep, n, cache);
>   			goto done;
>   		}
>   
>   		cache_objs = &cache->objs[cache->len];
>   
> -		/* The cache follows the following algorithm
> -		 *   1. Add the objects to the cache
> -		 *   2. Anything greater than the cache min value (if it
> -		 *   crosses the cache flush threshold) is flushed to the ring.
> -		 */
>   		/* Add elements back into the cache */
>   		uint32_t copied = 0;
>   		/* n is multiple of 32 */
> @@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>   		}
>   		cache->len += n;
>   
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk
> -				(mp, &cache->objs[cache->size],
> -				cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
> +		/* Increment stat. */
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>   		goto done;
>   	}
>   
> diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> index 3bb6f305df..307bb8556a 100644
> --- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> +++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> @@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
>   								rte_lcore_id());
>   		void **cache_objs;
>   
> -		if (!cache || cache->len == 0)
> -			goto normal;
> -
> -		cache_objs = &cache->objs[cache->len];
> -
> -		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +		if (!cache || unlikely(n + cache->len > cache->size)) {
> +			rte_mempool_generic_put(mp, (void *)txep, n, cache);
>   			goto done;
>   		}
>   
> -		/* The cache follows the following algorithm
> -		 *   1. Add the objects to the cache
> -		 *   2. Anything greater than the cache min value (if it crosses the
> -		 *   cache flush threshold) is flushed to the ring.
> -		 */
> +		cache_objs = &cache->objs[cache->len];
> +
>   		/* Add elements back into the cache */
>   		uint32_t copied = 0;
>   		/* n is multiple of 32 */
> @@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
>   		}
>   		cache->len += n;
>   
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk(mp,
> -						     &cache->objs[cache->size],
> -						     cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
> +		/* Increment stat. */
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>   		goto done;
>   	}
>   
> -normal:
>   	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>   	if (likely(m)) {
>   		free[0] = m;
> diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
> index 04148e8ea2..4ea1db734e 100644
> --- a/drivers/net/ice/ice_rxtx_vec_avx512.c
> +++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
> @@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
>   		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
>   				rte_lcore_id());
>   
> -		if (!cache || cache->len == 0)
> -			goto normal;
> -
> -		cache_objs = &cache->objs[cache->len];
> -
> -		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +		if (!cache || unlikely(n + cache->len > cache->size)) {
> +			rte_mempool_generic_put(mp, (void *)txep, n, cache);
>   			goto done;
>   		}
>   
> -		/* The cache follows the following algorithm
> -		 *   1. Add the objects to the cache
> -		 *   2. Anything greater than the cache min value (if it
> -		 *   crosses the cache flush threshold) is flushed to the ring.
> -		 */
> +		cache_objs = &cache->objs[cache->len];
> +
>   		/* Add elements back into the cache */
>   		uint32_t copied = 0;
>   		/* n is multiple of 32 */
> @@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
>   		}
>   		cache->len += n;
>   
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk
> -				(mp, &cache->objs[cache->size],
> -				 cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
> +		/* Increment stat. */
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>   		goto done;
>   	}
>   
> -normal:
>   	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>   	if (likely(m)) {
>   		free[0] = m;
> diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
> index dffef062e4..3c49b41a6d 100644
> --- a/lib/mempool/mempool_trace.h
> +++ b/lib/mempool/mempool_trace.h
> @@ -112,7 +112,6 @@ RTE_TRACE_POINT(
>   	rte_trace_point_emit_i32(socket_id);
>   	rte_trace_point_emit_ptr(cache);
>   	rte_trace_point_emit_u32(cache->len);
> -	rte_trace_point_emit_u32(cache->flushthresh);
>   )
>   
>   RTE_TRACE_POINT(
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index d8e39e5c20..40fb13239a 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -50,11 +50,6 @@ static void
>   mempool_event_callback_invoke(enum rte_mempool_event event,
>   			      struct rte_mempool *mp);
>   
> -/* Note: avoid using floating point since that compiler
> - * may not think that is constant.
> - */
> -#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
> -
>   #if defined(RTE_ARCH_X86)
>   /*
>    * return the greatest common divisor between a and b (fast algorithm)
> @@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
>   static void
>   mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
>   {
> -	/* Check that cache have enough space for flush threshold */
> -	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
> +	/* Check that cache have enough space for size */
> +	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
>   			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
>   			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
>   
>   	cache->size = size;
> -	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
>   	cache->len = 0;
>   }
>   
> @@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
>   
>   	/* asked cache too big */
>   	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
> -	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
> +	    cache_size > n) {
>   		rte_errno = EINVAL;
>   		return NULL;
>   	}
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 7bdc92b812..580c655eb3 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
>    */
>   struct __rte_cache_aligned rte_mempool_cache {
>   	uint32_t size;	      /**< Size of the cache */
> -	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>   	uint32_t len;	      /**< Current cache count */
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> -	uint32_t unused;
>   	/*
>   	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
>   	 * providing faster update access when using a mempool cache.
> @@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
>   	 * Cache is allocated to this size to allow it to overflow in certain
>   	 * cases to avoid needless emptying of cache.
>   	 */
> -	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
> +	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
>   };
>   
>   /**
> @@ -1363,7 +1361,8 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>   }
>   
>   /**
> - * @internal Put several objects back in the mempool; used internally.
> + * @internal Put several objects back in the mempool; used internally when
> + *   the number of objects exceeds the remaining space in the mempool cache.
>    * @param mp
>    *   A pointer to the mempool structure.
>    * @param obj_table
> @@ -1371,58 +1370,86 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>    * @param n
>    *   The number of objects to store back in the mempool, must be strictly
>    *   positive.
> + *   Must be more than the remaining space in the mempool cache, i.e.:
> + *   cache->len + n > cache->size
>    * @param cache
> - *   A pointer to a mempool cache structure. May be NULL if not needed.
> + *   A pointer to a mempool cache structure. Not NULL.
>    */
> -static __rte_always_inline void
> -rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> -			   unsigned int n, struct rte_mempool_cache *cache)
> +static __rte_noinline void

In a sense this is the slow path, but it will still be run quite often, 
will it not? You could just mark it __rte_unused and let the compiler 
decide what to do, or maybe better, move it to the .c file if you really 
don't think a function call proper is a performance issue.

With __rte_noinline you will prevent the compiler from inlining (i.e., 
it's not a hint). Even though it may *know* (i.e., from PGO) that this 
is a common path (e.g., for cache-less memory pools).

> +rte_mempool_do_generic_put_many(struct rte_mempool *mp, void * const *obj_table,
> +		unsigned int n, struct rte_mempool_cache *cache)
>   {
>   	void **cache_objs;
> +	unsigned int len;
> +	const uint32_t cache_size = cache->size;
>   
> -	/* No cache provided */
> -	if (unlikely(cache == NULL))
> -		goto driver_enqueue;
> -
> -	/* increment stat now, adding in mempool always success */
> +	/* Increment stat now, adding in mempool always succeeds. */
>   	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
>   	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>   
> -	/* The request itself is too big for the cache */
> -	if (unlikely(n > cache->flushthresh))
> -		goto driver_enqueue_stats_incremented;
> -
> -	/*
> -	 * The cache follows the following algorithm:
> -	 *   1. If the objects cannot be added to the cache without crossing
> -	 *      the flush threshold, flush the cache to the backend.
> -	 *   2. Add the objects to the cache.
> -	 */
> -
> -	if (cache->len + n <= cache->flushthresh) {
> -		cache_objs = &cache->objs[cache->len];
> -		cache->len += n;
> -	} else {
> -		cache_objs = &cache->objs[0];
> -		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> -		cache->len = n;
> +	/* Fill the cache with the first objects. */
> +	cache_objs = &cache->objs[cache->len];
> +	len = (cache_size - cache->len);
> +	rte_memcpy(cache_objs, obj_table, sizeof(void *) * len);
> +	obj_table += len;
> +	n -= len;
> +
> +	/* Flush the entire cache to the backend. */
> +	cache_objs = &cache->objs[0];
> +	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
> +
> +	if (unlikely(n > cache_size)) {
> +		/* Push following objects, in entire cache sizes, directly to the backend. */
> +		len = n - n % cache_size;
> +		rte_mempool_ops_enqueue_bulk(mp, obj_table, len);
> +		obj_table += len;
> +		n -= len;
>   	}
>   
> -	/* Add the objects to the cache. */
> +	/* Add the remaining objects to the cache. */
> +	cache->len = n;
>   	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> +}
>   
> -	return;
> -
> -driver_enqueue:
> -
> -	/* increment stat now, adding in mempool always success */
> -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> -
> -driver_enqueue_stats_incremented:
> +/**
> + * @internal Put several objects back in the mempool; used internally.
> + * @param mp
> + *   A pointer to the mempool structure.
> + * @param obj_table
> + *   A pointer to a table of void * pointers (objects).
> + * @param n
> + *   The number of objects to store back in the mempool, must be strictly
> + *   positive.
> + * @param cache
> + *   A pointer to a mempool cache structure. May be NULL if not needed.
> + */
> +static __rte_always_inline void
> +rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> +			   unsigned int n, struct rte_mempool_cache *cache)
> +{
> +	if (likely(cache != NULL)) {
> +		/* Enough remaining space in the cache? */
> +		if (likely(cache->len + n <= cache->size)) {
> +			void **cache_objs;
> +
> +			/* Increment stat now, adding in mempool always succeeds. */
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
> +			/* Add the objects to the cache. */
> +			cache_objs = &cache->objs[cache->len];
> +			cache->len += n;
> +			rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> +		} else
> +			rte_mempool_do_generic_put_many(mp, obj_table, n, cache);
> +	} else {
> +		/* Increment stat now, adding in mempool always succeeds. */
> +		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> +		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
>   
> -	/* push objects to the backend */
> -	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> +		/* push objects to the backend */
> +		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> +	}
>   }
>   
>   
> @@ -1490,135 +1517,185 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
>   }
>   
>   /**
> - * @internal Get several objects from the mempool; used internally.
> + * @internal Get several objects from the mempool; used internally when
> + *   the number of objects exceeds what is available in the mempool cache.
>    * @param mp
>    *   A pointer to the mempool structure.
>    * @param obj_table
>    *   A pointer to a table of void * pointers (objects).
>    * @param n
>    *   The number of objects to get, must be strictly positive.
> + *   Must be more than available in the mempool cache, i.e.:
> + *   n > cache->len
>    * @param cache
> - *   A pointer to a mempool cache structure. May be NULL if not needed.
> + *   A pointer to a mempool cache structure. Not NULL.
>    * @return
>    *   - 0: Success.
>    *   - <0: Error; code of driver dequeue function.
>    */
> -static __rte_always_inline int
> -rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> -			   unsigned int n, struct rte_mempool_cache *cache)
> +static __rte_noinline int
> +rte_mempool_do_generic_get_many(struct rte_mempool *mp, void **obj_table,
> +		unsigned int n, struct rte_mempool_cache *cache)
>   {
>   	int ret;
>   	unsigned int remaining;
>   	uint32_t index, len;
>   	void **cache_objs;
> +	const uint32_t cache_size = cache->size;
>   
> -	/* No cache provided */
> -	if (unlikely(cache == NULL)) {
> -		remaining = n;
> -		goto driver_dequeue;
> -	}
> -
> -	/* The cache is a stack, so copy will be in reverse order. */
> +	/* Serve the first part of the request from the cache to return hot objects first. */
>   	cache_objs = &cache->objs[cache->len];
> +	len = cache->len;
> +	remaining = n - len;
> +	for (index = 0; index < len; index++)
> +		*obj_table++ = *--cache_objs;
>   
> -	if (__rte_constant(n) && n <= cache->len) {
> +	/* At this point, the cache is empty. */
> +
> +	/* More than can be served from a full cache? */
> +	if (unlikely(remaining >= cache_size)) {
>   		/*
> -		 * The request size is known at build time, and
> -		 * the entire request can be satisfied from the cache,
> -		 * so let the compiler unroll the fixed length copy loop.
> +		 * Serve the following part of the request directly from the backend
> +		 * in multiples of the cache size.
>   		 */
> -		cache->len -= n;
> -		for (index = 0; index < n; index++)
> -			*obj_table++ = *--cache_objs;
> +		len = remaining - remaining % cache_size;
> +		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
> +		if (unlikely(ret < 0)) {
> +			/*
> +			 * No further action is required to roll back the request,
> +			 * as objects in the cache are intact, and no objects have
> +			 * been dequeued from the backend.
> +			 */
>   
> -		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> -		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>   
> -		return 0;
> -	}
> +			return ret;
> +		}
>   
> -	/*
> -	 * Use the cache as much as we have to return hot objects first.
> -	 * If the request size 'n' is known at build time, the above comparison
> -	 * ensures that n > cache->len here, so omit RTE_MIN().
> -	 */
> -	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
> -	cache->len -= len;
> -	remaining = n - len;
> -	for (index = 0; index < len; index++)
> -		*obj_table++ = *--cache_objs;
> +		remaining -= len;
> +		obj_table += len;
>   
> -	/*
> -	 * If the request size 'n' is known at build time, the case
> -	 * where the entire request can be satisfied from the cache
> -	 * has already been handled above, so omit handling it here.
> -	 */
> -	if (!__rte_constant(n) && remaining == 0) {
> -		/* The entire request is satisfied from the cache. */
> +		if (unlikely(remaining == 0)) {
> +			cache->len = 0;
>   
> -		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> -		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
> -		return 0;
> +			return 0;
> +		}
>   	}
>   
> -	/* if dequeue below would overflow mem allocated for cache */
> -	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> -		goto driver_dequeue;
> -
> -	/* Fill the cache from the backend; fetch size + remaining objects. */
> -	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> -			cache->size + remaining);
> +	/* Fill the entire cache from the backend. */
> +	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
>   	if (unlikely(ret < 0)) {
>   		/*
> -		 * We are buffer constrained, and not able to allocate
> -		 * cache + remaining.
> -		 * Do not fill the cache, just satisfy the remaining part of
> -		 * the request directly from the backend.
> +		 * Unable to fill the cache.
> +		 * Last resort: Try only the remaining part of the request,
> +		 * served directly from the backend.
>   		 */
> -		goto driver_dequeue;
> -	}
> +		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> +		if (unlikely(ret == 0)) {
> +			cache->len = 0;
>   
> -	/* Satisfy the remaining part of the request from the filled cache. */
> -	cache_objs = &cache->objs[cache->size + remaining];
> -	for (index = 0; index < remaining; index++)
> -		*obj_table++ = *--cache_objs;
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +
> +			return 0;
> +		}
> +
> +		/* Roll back. */
> +		if (cache->len + remaining == n) {
> +			/*
> +			 * No further action is required to roll back the request,
> +			 * as objects in the cache are intact, and no objects have
> +			 * been dequeued from the backend.
> +			 */
> +		} else {
> +			/* Update the state of the cache before putting back the objects. */
> +			cache->len = 0;
> +
> +			len = n - remaining;
> +			obj_table -= len;
> +			rte_mempool_do_generic_put(mp, obj_table, len, cache);
> +		}
>   
> -	cache->len = cache->size;
> +		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> +		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> +
> +		return ret;
> +	}
>   
> +	/* Increment stat now, this always succeeds. */
>   	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
>   	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
> +	/* Serve the remaining part of the request from the filled cache. */
> +	cache_objs = &cache->objs[cache_size];
> +	for (index = 0; index < remaining; index++)
> +		*obj_table++ = *--cache_objs;
> +
> +	cache->len = cache_size - remaining;
> +
>   	return 0;
> +}
>   
> -driver_dequeue:
> +/**
> + * @internal Get several objects from the mempool; used internally.
> + * @param mp
> + *   A pointer to the mempool structure.
> + * @param obj_table
> + *   A pointer to a table of void * pointers (objects).
> + * @param n
> + *   The number of objects to get, must be strictly positive.
> + * @param cache
> + *   A pointer to a mempool cache structure. May be NULL if not needed.
> + * @return
> + *   - 0: Success.
> + *   - <0: Error; code of driver dequeue function.
> + */
> +static __rte_always_inline int
> +rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> +			   unsigned int n, struct rte_mempool_cache *cache)
> +{
> +	if (likely(cache != NULL)) {
> +		/* Enough objects in the cache? */
> +		if (n <= cache->len) {
> +			unsigned int index;
> +			void **cache_objs;
>   
> -	/* Get remaining objects directly from the backend. */
> -	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> +			/* Increment stat now, this always succeeds. */
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
> -	if (ret < 0) {
> -		if (likely(cache != NULL)) {
> -			cache->len = n - remaining;
>   			/*
> -			 * No further action is required to roll the first part
> -			 * of the request back into the cache, as objects in
> -			 * the cache are intact.
> +			 * The cache is a stack, so copy will be in reverse order.
> +			 * If the request size is known at build time,
> +			 * the compiler will unroll the fixed length copy loop.
>   			 */
> -		}
> -
> -		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> +			cache_objs = &cache->objs[cache->len];
> +			cache->len -= n;
> +			for (index = 0; index < n; index++)
> +				*obj_table++ = *--cache_objs;
> +
> +			return 0;
> +		} else
> +			return rte_mempool_do_generic_get_many(mp, obj_table, n, cache);
>   	} else {
> -		if (likely(cache != NULL)) {
> -			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> -			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +		int ret;
> +
> +		/* Get the objects directly from the backend. */
> +		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
> +		if (unlikely(ret < 0)) {
> +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>   		} else {
>   			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
>   			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
>   		}
> -	}
>   
> -	return ret;
> +		return ret;
> +	}
>   }
>   
>   /**


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v4] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
  2024-09-20 16:37 ` [RFC PATCH v2] " Morten Brørup
  2024-09-20 17:13 ` [RFC PATCH v3] " Morten Brørup
@ 2024-09-22 10:50 ` Morten Brørup
  2024-09-24  3:58 ` [RFC PATCH v5] " Morten Brørup
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-22 10:50 UTC (permalink / raw)
  To: dev, Mattias Rönnblom; +Cc: Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size is held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of a CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
non-inline helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 ++--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     | 155 ++++++++++-
 lib/mempool/rte_mempool.h                     | 250 +++++++-----------
 lib/mempool/version.map                       |   4 +
 10 files changed, 289 insertions(+), 276 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..59b24ddd2d 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -33,6 +33,149 @@
 
 RTE_LOG_REGISTER_DEFAULT(rte_mempool_logtype, INFO);
 
+void
+rte_mempool_do_generic_put_many(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
+{
+	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
+
+	/* Increment stat now, adding in mempool always succeeds. */
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	if (unlikely(n > cache_size)) {
+		/* Push following objects, in entire cache sizes, directly to the backend. */
+		len = n - n % cache_size;
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, len);
+		obj_table += len;
+		n -= len;
+	}
+
+	/* Add the remaining objects to the cache. */
+	cache->len = n;
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
+
+int
+rte_mempool_do_generic_get_many(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
+{
+	int ret;
+	unsigned int remaining;
+	uint32_t index, len;
+	void **cache_objs;
+	const uint32_t cache_size = cache->size;
+
+	/* Serve the first part of the request from the cache to return hot objects first. */
+	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
+
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
+		/*
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
+		 */
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+			return ret;
+		}
+
+		remaining -= len;
+		obj_table += len;
+
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
+
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+	}
+
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
+	if (unlikely(ret < 0)) {
+		/*
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
+		 */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
+
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
+	}
+
+	/* Increment stat now, this always succeeds. */
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
+	for (index = 0; index < remaining; index++)
+		*obj_table++ = *--cache_objs;
+
+	cache->len = cache_size - remaining;
+
+	return 0;
+}
+
 TAILQ_HEAD(rte_mempool_list, rte_tailq_entry);
 
 static struct rte_tailq_elem rte_mempool_tailq = {
@@ -50,11 +193,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +884,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that cache has enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +973,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..4af519c409 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1362,6 +1360,29 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	cache->len = 0;
 }
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
+ * @param cache
+ *   A pointer to a mempool cache structure. Not NULL.
+ */
+__rte_experimental
+void
+rte_mempool_do_generic_put_many(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache *cache);
+
 /**
  * @internal Put several objects back in the mempool; used internally.
  * @param mp
@@ -1378,51 +1399,30 @@ static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
-
-	/* No cache provided */
-	if (unlikely(cache == NULL))
-		goto driver_enqueue;
-
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
-	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
-
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
+	if (!(__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE) &&
+			likely(cache != NULL)) {
+		/* Enough remaining space in the cache? */
+		if (likely(cache->len + n <= cache->size)) {
+			void **cache_objs;
+
+			/* Increment stat now, adding in mempool always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
+			/* Add the objects to the cache. */
+			cache_objs = &cache->objs[cache->len];
+			cache->len += n;
+			rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		} else
+			rte_mempool_do_generic_put_many(mp, obj_table, n, cache);
 	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
-
-	/* Add the objects to the cache. */
-	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
-
-	return;
-
-driver_enqueue:
+		/* Increment stat now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
-
-driver_enqueue_stats_incremented:
-
-	/* push objects to the backend */
-	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+		/* Push the objects directly to the backend. */
+		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
+	}
 }
 
 
@@ -1489,6 +1489,31 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 	rte_mempool_put_bulk(mp, &obj, 1);
 }
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
+ * @param cache
+ *   A pointer to a mempool cache structure. Not NULL.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+__rte_experimental
+int
+rte_mempool_do_generic_get_many(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache *cache);
+
 /**
  * @internal Get several objects from the mempool; used internally.
  * @param mp
@@ -1507,118 +1532,45 @@ static __rte_always_inline int
 rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
-	int ret;
-	unsigned int remaining;
-	uint32_t index, len;
-	void **cache_objs;
-
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
-	cache_objs = &cache->objs[cache->len];
-
-	if (__rte_constant(n) && n <= cache->len) {
-		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
-		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
-
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-		return 0;
-	}
-
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
-
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
-
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-		return 0;
-	}
-
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
-	if (unlikely(ret < 0)) {
-		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
-		 */
-		goto driver_dequeue;
-	}
-
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
-	for (index = 0; index < remaining; index++)
-		*obj_table++ = *--cache_objs;
-
-	cache->len = cache->size;
-
-	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-	return 0;
-
-driver_dequeue:
-
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	if (!(__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE) &&
+			likely(cache != NULL)) {
+		/* Enough objects in the cache? */
+		if (n <= cache->len) {
+			unsigned int index;
+			void **cache_objs;
+
+			/* Increment stat now, this always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
 			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
 			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_many(mp, obj_table, n, cache);
 	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
diff --git a/lib/mempool/version.map b/lib/mempool/version.map
index 6f16d417ae..9345376e64 100644
--- a/lib/mempool/version.map
+++ b/lib/mempool/version.map
@@ -54,6 +54,10 @@ EXPERIMENTAL {
 	# added in 24.07
 	rte_mempool_get_mem_range;
 	rte_mempool_get_obj_alignment;
+
+	# added in 24.11
+	rte_mempool_do_generic_put_many;
+	rte_mempool_do_generic_get_many;
 };
 
 INTERNAL {
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v5] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (2 preceding siblings ...)
  2024-09-22 10:50 ` [RFC PATCH v4] mempool: fix mempool " Morten Brørup
@ 2024-09-24  3:58 ` Morten Brørup
  2024-09-24 11:58 ` [RFC PATCH v6] " Morten Brørup
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-24  3:58 UTC (permalink / raw)
  To: dev, Mattias Rönnblom; +Cc: Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size is held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of a CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
separate helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +---
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 284 +++++++++++-------
 9 files changed, 223 insertions(+), 229 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that cache have enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..dacf0d7d98 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1362,6 +1360,47 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	cache->len = 0;
 }
 
+/**
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
+ *   Must be less than the size of the mempool cache, i.e.:
+ *   n < cache->size
+ * @param cache
+ *   A pointer to a mempool cache structure. Not NULL.
+ */
+static void
+rte_mempool_do_generic_put_split(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
+{
+	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
+
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	/* Add the remaining objects to the cache. */
+	cache->len = n;
+	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
+
 /**
  * @internal Put several objects back in the mempool; used internally.
  * @param mp
@@ -1378,50 +1417,42 @@ static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
-
-	/* No cache provided */
+	/* No cache provided? */
 	if (unlikely(cache == NULL))
 		goto driver_enqueue;
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
+	/* The request itself is known to be too big for any cache? */
+	if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
 		goto driver_enqueue_stats_incremented;
 
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
+	/* Enough remaining space in the cache? */
+	if (likely(cache->len + n <= cache->size)) {
+		void **cache_objs;
 
-	if (cache->len + n <= cache->flushthresh) {
+		/* Add the objects to the cache. */
 		cache_objs = &cache->objs[cache->len];
 		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
-
-	/* Add the objects to the cache. */
-	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+	} else if (likely(n < cache->size))
+		rte_mempool_do_generic_put_split(mp, obj_table, n, cache);
+	else
+		goto driver_enqueue_stats_incremented;
 
 	return;
 
 driver_enqueue:
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
 driver_enqueue_stats_incremented:
 
-	/* push objects to the backend */
+	/* Push the objects directly to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1490,135 +1521,184 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static int
+rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache *cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
+
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+	cache->len = cache_size - remaining;
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache *cache)
+{
+	if (likely(cache != NULL)) {
+		/* Enough objects in the cache? */
+		if (likely(n <= cache->len)) {
+			unsigned int index;
+			void **cache_objs;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+			/* Increment stat now, this always succeeds. */
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
 			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
 			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_split(mp, obj_table, n, cache);
 	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v6] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (3 preceding siblings ...)
  2024-09-24  3:58 ` [RFC PATCH v5] " Morten Brørup
@ 2024-09-24 11:58 ` Morten Brørup
  2024-09-24 18:12 ` [RFC PATCH v7] " Morten Brørup
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-24 11:58 UTC (permalink / raw)
  To: dev, Mattias Rönnblom; +Cc: Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size is held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of the CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
separate helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +---
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 287 ++++++++++++------
 9 files changed, 231 insertions(+), 224 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that the cache has enough space for its size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..0801cec24a 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1362,6 +1360,48 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	cache->len = 0;
 }
 
+/**
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
+ *   Must be less than the size of the mempool cache, i.e.:
+ *   n < cache->size
+ * @param cache
+ *   A pointer to a mempool cache structure. Not NULL.
+ */
+static void
+rte_mempool_do_generic_put_split(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
+{
+	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
+
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	cache->len = n - len; /* Moved to here (for performance). */
+	/* rte_ */ memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	/* Add the remaining objects to the cache. */
+	/* Moved from here (for performance): cache->len = n; */
+	/* rte_ */ memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
+
 /**
  * @internal Put several objects back in the mempool; used internally.
  * @param mp
@@ -1376,52 +1416,44 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
-	void **cache_objs;
-
-	/* No cache provided */
+	/* No cache provided? */
 	if (unlikely(cache == NULL))
 		goto driver_enqueue;
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
+	/* The request itself is known to be too big for any cache? */
+	if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
 		goto driver_enqueue_stats_incremented;
 
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
+	/* Enough remaining space in the cache? */
+	if (likely(cache->len + n <= cache->size)) {
+		void **cache_objs;
 
-	if (cache->len + n <= cache->flushthresh) {
+		/* Add the objects to the cache. */
 		cache_objs = &cache->objs[cache->len];
 		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
-
-	/* Add the objects to the cache. */
-	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+	} else if (likely(n < cache->size))
+		rte_mempool_do_generic_put_split(mp, obj_table, n, cache);
+	else
+		goto driver_enqueue_stats_incremented;
 
 	return;
 
 driver_enqueue:
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
 driver_enqueue_stats_incremented:
 
-	/* push objects to the backend */
+	/* Push the objects directly to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1490,122 +1522,183 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static int
+rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
+
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+	cache->len = cache_size - remaining;
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache * const cache)
+{
+	int ret;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	/* No cache provided? */
+	if (unlikely(cache == NULL))
+		goto driver_dequeue;
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
+	/* The request itself is known to be too big for any cache? */
+	if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
+		goto driver_dequeue;
+
+	/* The request can be served entirely from the cache? */
+	if (likely(n <= cache->len)) {
+		unsigned int index;
+		void **cache_objs;
 
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+		/*
+		 * The cache is a stack, so copy will be in reverse order.
+		 * If the request size is known at build time,
+		 * the compiler will unroll the fixed length copy loop.
+		 */
+		cache_objs = &cache->objs[cache->len];
+		cache->len -= n;
+		for (index = 0; index < n; index++)
+			*obj_table++ = *--cache_objs;
+
+		return 0;
+	} else
+		return rte_mempool_do_generic_get_split(mp, obj_table, n, cache);
+
+driver_dequeue:
+
+	/* Get the objects directly from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+	if (unlikely(ret < 0)) {
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v7] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (4 preceding siblings ...)
  2024-09-24 11:58 ` [RFC PATCH v6] " Morten Brørup
@ 2024-09-24 18:12 ` Morten Brørup
  2024-09-24 20:44   ` Patrick Robb
  2024-09-25 21:33 ` [RFC PATCH v8] " Morten Brørup
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 14+ messages in thread
From: Morten Brørup @ 2024-09-24 18:12 UTC (permalink / raw)
  To: dev; +Cc: Mattias Rönnblom, Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size is held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of the CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
separate helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v7:
* Increased max mempool cache size from 512 to 1024 objects.
  Mainly for CI performance test purposes.
  Originally, the max mempool cache size was 768 objects, and used a fixed
  size array of 1024 objects in the mempool cache structure.
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 config/rte_config.h                           |   2 +-
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +---
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 287 ++++++++++++------
 10 files changed, 232 insertions(+), 225 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index dd7bb0d35b..2488ff167d 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -56,7 +56,7 @@
 #define RTE_CONTIGMEM_DEFAULT_BUF_SIZE (512*1024*1024)
 
 /* mempool defines */
-#define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+#define RTE_MEMPOOL_CACHE_MAX_SIZE 1024
 /* RTE_LIBRTE_MEMPOOL_STATS is not set */
 /* RTE_LIBRTE_MEMPOOL_DEBUG is not set */
 
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that the cache has enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..0801cec24a 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
+	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
 };
 
 /**
@@ -1362,6 +1360,48 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	cache->len = 0;
 }
 
+/**
+ * @internal Put several objects back in the mempool; used internally when
+ *   the number of objects exceeds the remaining space in the mempool cache.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to store back in the mempool, must be strictly
+ *   positive.
+ *   Must be more than the remaining space in the mempool cache, i.e.:
+ *   cache->len + n > cache->size
+ *   Must be less than the size of the mempool cache, i.e.:
+ *   n < cache->size
+ * @param cache
+ *   A pointer to a mempool cache structure. Not NULL.
+ */
+static void
+rte_mempool_do_generic_put_split(struct rte_mempool *mp, void * const *obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
+{
+	void **cache_objs;
+	unsigned int len;
+	const uint32_t cache_size = cache->size;
+
+	/* Fill the cache with the first objects. */
+	cache_objs = &cache->objs[cache->len];
+	len = (cache_size - cache->len);
+	cache->len = n - len; /* Moved to here (for performance). */
+	/* rte_ */ memcpy(cache_objs, obj_table, sizeof(void *) * len);
+	obj_table += len;
+	n -= len;
+
+	/* Flush the entire cache to the backend. */
+	cache_objs = &cache->objs[0];
+	rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
+
+	/* Add the remaining objects to the cache. */
+	/* Moved from here (for performance): cache->len = n; */
+	/* rte_ */ memcpy(cache_objs, obj_table, sizeof(void *) * n);
+}
+
 /**
  * @internal Put several objects back in the mempool; used internally.
  * @param mp
@@ -1376,52 +1416,44 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
-	void **cache_objs;
-
-	/* No cache provided */
+	/* No cache provided? */
 	if (unlikely(cache == NULL))
 		goto driver_enqueue;
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
+	/* The request itself is known to be too big for any cache? */
+	if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
 		goto driver_enqueue_stats_incremented;
 
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
+	/* Enough remaining space in the cache? */
+	if (likely(cache->len + n <= cache->size)) {
+		void **cache_objs;
 
-	if (cache->len + n <= cache->flushthresh) {
+		/* Add the objects to the cache. */
 		cache_objs = &cache->objs[cache->len];
 		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
-
-	/* Add the objects to the cache. */
-	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
+	} else if (likely(n < cache->size))
+		rte_mempool_do_generic_put_split(mp, obj_table, n, cache);
+	else
+		goto driver_enqueue_stats_incremented;
 
 	return;
 
 driver_enqueue:
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
 driver_enqueue_stats_incremented:
 
-	/* push objects to the backend */
+	/* Push the objects directly to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1490,122 +1522,183 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static int
+rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* At this point, the cache is empty. */
+
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
+
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+	cache->len = cache_size - remaining;
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache * const cache)
+{
+	int ret;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	/* No cache provided? */
+	if (unlikely(cache == NULL))
+		goto driver_dequeue;
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
+	/* The request itself is known to be too big for any cache? */
+	if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
+		goto driver_dequeue;
+
+	/* The request can be served entirely from the cache? */
+	if (likely(n <= cache->len)) {
+		unsigned int index;
+		void **cache_objs;
 
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+		/*
+		 * The cache is a stack, so copy will be in reverse order.
+		 * If the request size is known at build time,
+		 * the compiler will unroll the fixed length copy loop.
+		 */
+		cache_objs = &cache->objs[cache->len];
+		cache->len -= n;
+		for (index = 0; index < n; index++)
+			*obj_table++ = *--cache_objs;
+
+		return 0;
+	} else
+		return rte_mempool_do_generic_get_split(mp, obj_table, n, cache);
+
+driver_dequeue:
+
+	/* Get the objects directly from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+	if (unlikely(ret < 0)) {
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH v7] mempool: fix mempool cache size
  2024-09-24 18:12 ` [RFC PATCH v7] " Morten Brørup
@ 2024-09-24 20:44   ` Patrick Robb
  0 siblings, 0 replies; 14+ messages in thread
From: Patrick Robb @ 2024-09-24 20:44 UTC (permalink / raw)
  To: Morten Brørup; +Cc: dev, Mattias Rönnblom

[-- Attachment #1: Type: text/plain, Size: 37732 bytes --]

Recheck-request: iol-intel-Performance

On Tue, Sep 24, 2024 at 2:12 PM Morten Brørup <mb@smartsharesystems.com>
wrote:

> This patch refactors the mempool cache to fix two bugs:
> 1. When a mempool is created with a cache size of N objects, the cache was
> actually created with a size of 1.5 * N objects.
> 2. The mempool cache field names did not reflect their purpose;
> the "flushthresh" field held the size, and the "size" field held the
> number of objects remaining in the cache when returning from a get
> operation refilling it from the backend.
>
> Especially the first item could be fatal:
> When more objects than a mempool's configured cache size is held in the
> mempool's caches associated with other lcores, a rightsized mempool may
> unexpectedly run out of objects, causing the application to fail.
>
> Furthermore, this patch introduces two optimizations:
> 1. The mempool caches are flushed to/filled from the backend in their
> entirety, so backend accesses are CPU cache line aligned. (Assuming the
> mempool cache size is a multiple of a CPU cache line size divided by the
> size of a pointer.)
> 2. The unlikely paths in the get and put functions, where the cache is
> flushed to/filled from the backend, are moved from the inline functions to
> separate helper functions, thereby reducing the code size of the inline
> functions.
> Note: Accessing the backend for cacheless mempools remains inline.
>
> Various drivers accessing the mempool directly have been updated
> accordingly.
> These drivers did not update mempool statistics when accessing the mempool
> directly, so that is fixed too.
>
> Note: Performance not yet benchmarked.
>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v7:
> * Increased max mempool cache size from 512 to 1024 objects.
>   Mainly for CI performance test purposes.
>   Originally, the max mempool cache size was 768 objects, and used a fixed
>   size array of 1024 objects in the mempool cache structure.
> v6:
> * Fix v5 incomplete implementation of passing large requests directly to
>   the backend.
> * Use memcpy instead of rte_memcpy where compiler complains about it.
> * Added const to some function parameters.
> v5:
> * Moved helper functions back into the header file, for improved
>   performance.
> * Pass large requests directly to the backend. This also simplifies the
>   code.
> v4:
> * Updated subject to reflect that misleading names are considered bugs.
> * Rewrote patch description to provide more details about the bugs fixed.
>   (Mattias Rönnblom)
> * Moved helper functions, not to be inlined, to mempool C file.
>   (Mattias Rönnblom)
> * Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
>   time directly to backend driver, to avoid calling the helper functions.
>   This also fixes the compiler warnings about out of bounds array access.
> v3:
> * Removed __attribute__(assume).
> v2:
> * Removed mempool perf test; not part of patch set.
> ---
>  config/rte_config.h                           |   2 +-
>  drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 +---
>  drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
>  drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
>  drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
>  drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
>  lib/mempool/mempool_trace.h                   |   1 -
>  lib/mempool/rte_mempool.c                     |  12 +-
>  lib/mempool/rte_mempool.h                     | 287 ++++++++++++------
>  10 files changed, 232 insertions(+), 225 deletions(-)
>
> diff --git a/config/rte_config.h b/config/rte_config.h
> index dd7bb0d35b..2488ff167d 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -56,7 +56,7 @@
>  #define RTE_CONTIGMEM_DEFAULT_BUF_SIZE (512*1024*1024)
>
>  /* mempool defines */
> -#define RTE_MEMPOOL_CACHE_MAX_SIZE 512
> +#define RTE_MEMPOOL_CACHE_MAX_SIZE 1024
>  /* RTE_LIBRTE_MEMPOOL_STATS is not set */
>  /* RTE_LIBRTE_MEMPOOL_DEBUG is not set */
>
> diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c
> b/drivers/common/idpf/idpf_common_rxtx_avx512.c
> index 3b5e124ec8..98535a48f3 100644
> --- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
> +++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
> @@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct
> idpf_tx_queue *txq)
>
> rte_lcore_id());
>                 void **cache_objs;
>
> -               if (cache == NULL || cache->len == 0)
> -                       goto normal;
> -
> -               cache_objs = &cache->objs[cache->len];
> -
> -               if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -                       rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +               if (!cache || unlikely(n + cache->len > cache->size)) {
> +                       rte_mempool_generic_put(mp, (void *)txep, n,
> cache);
>                         goto done;
>                 }
>
> -               /* The cache follows the following algorithm
> -                *   1. Add the objects to the cache
> -                *   2. Anything greater than the cache min value (if it
> crosses the
> -                *   cache flush threshold) is flushed to the ring.
> -                */
> +               cache_objs = &cache->objs[cache->len];
> +
>                 /* Add elements back into the cache */
>                 uint32_t copied = 0;
>                 /* n is multiple of 32 */
> @@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct
> idpf_tx_queue *txq)
>                 }
>                 cache->len += n;
>
> -               if (cache->len >= cache->flushthresh) {
> -                       rte_mempool_ops_enqueue_bulk(mp,
> -
> &cache->objs[cache->size],
> -                                                    cache->len -
> cache->size);
> -                       cache->len = cache->size;
> -               }
> +               /* Increment stat. */
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>                 goto done;
>         }
>
> -normal:
>         m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>         if (likely(m != NULL)) {
>                 free[0] = m;
> @@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct
> idpf_tx_queue *txq)
>
> rte_lcore_id());
>                 void **cache_objs;
>
> -               if (!cache || cache->len == 0)
> -                       goto normal;
> -
> -               cache_objs = &cache->objs[cache->len];
> -
> -               if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -                       rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +               if (!cache || unlikely(n + cache->len > cache->size)) {
> +                       rte_mempool_generic_put(mp, (void *)txep, n,
> cache);
>                         goto done;
>                 }
>
> -               /* The cache follows the following algorithm
> -                *   1. Add the objects to the cache
> -                *   2. Anything greater than the cache min value (if it
> crosses the
> -                *   cache flush threshold) is flushed to the ring.
> -                */
> +               cache_objs = &cache->objs[cache->len];
> +
>                 /* Add elements back into the cache */
>                 uint32_t copied = 0;
>                 /* n is multiple of 32 */
> @@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct
> idpf_tx_queue *txq)
>                 }
>                 cache->len += n;
>
> -               if (cache->len >= cache->flushthresh) {
> -                       rte_mempool_ops_enqueue_bulk(mp,
> -
> &cache->objs[cache->size],
> -                                                    cache->len -
> cache->size);
> -                       cache->len = cache->size;
> -               }
> +               /* Increment stat. */
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>                 goto done;
>         }
>
> -normal:
>         m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>         if (likely(m)) {
>                 free[0] = m;
> diff --git a/drivers/mempool/dpaa/dpaa_mempool.c
> b/drivers/mempool/dpaa/dpaa_mempool.c
> index 74bfcab509..3a936826c8 100644
> --- a/drivers/mempool/dpaa/dpaa_mempool.c
> +++ b/drivers/mempool/dpaa/dpaa_mempool.c
> @@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
>         struct bman_pool_params params = {
>                 .flags = BMAN_POOL_FLAG_DYNAMIC_BPID
>         };
> -       unsigned int lcore_id;
> -       struct rte_mempool_cache *cache;
>
>         MEMPOOL_INIT_FUNC_TRACE();
>
> @@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
>         rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
>                    sizeof(struct dpaa_bp_info));
>         mp->pool_data = (void *)bp_info;
> -       /* Update per core mempool cache threshold to optimal value which
> is
> -        * number of buffers that can be released to HW buffer pool in
> -        * a single API call.
> -        */
> -       for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> -               cache = &mp->local_cache[lcore_id];
> -               DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
> -                       lcore_id, cache->flushthresh,
> -                       (uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
> -               if (cache->flushthresh)
> -                       cache->flushthresh = cache->size +
> DPAA_MBUF_MAX_ACQ_REL;
> -       }
>
>         DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
>         return 0;
> @@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
>         DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
>                              count, bp_info->bpid);
>
> -       if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
> +       if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
>                 DPAA_MEMPOOL_ERR("Unable to allocate requested (%u)
> buffers",
>                                  count);
>                 return -1;
> diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> index 42e17d984c..a44f3cf616 100644
> --- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> +++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
> @@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
>         struct dpaa2_bp_info *bp_info;
>         struct dpbp_attr dpbp_attr;
>         uint32_t bpid;
> -       unsigned int lcore_id;
> -       struct rte_mempool_cache *cache;
>         int ret;
>
>         avail_dpbp = dpaa2_alloc_dpbp_dev();
> @@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
>         DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d",
> dpbp_attr.bpid);
>
>         h_bp_list = bp_list;
> -       /* Update per core mempool cache threshold to optimal value which
> is
> -        * number of buffers that can be released to HW buffer pool in
> -        * a single API call.
> -        */
> -       for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> -               cache = &mp->local_cache[lcore_id];
> -               DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d ->
> %d",
> -                       lcore_id, cache->flushthresh,
> -                       (uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
> -               if (cache->flushthresh)
> -                       cache->flushthresh = cache->size +
> DPAA2_MBUF_MAX_ACQ_REL;
> -       }
>
>         return 0;
>  err3:
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 0238b03f8a..712ab1726f 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> @@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>                 struct rte_mempool_cache *cache =
> rte_mempool_default_cache(mp,
>                                 rte_lcore_id());
>
> -               if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> +               if (!cache || unlikely(n + cache->len > cache->size)) {
>                         rte_mempool_generic_put(mp, (void *)txep, n,
> cache);
>                         goto done;
>                 }
>
>                 cache_objs = &cache->objs[cache->len];
>
> -               /* The cache follows the following algorithm
> -                *   1. Add the objects to the cache
> -                *   2. Anything greater than the cache min value (if it
> -                *   crosses the cache flush threshold) is flushed to the
> ring.
> -                */
>                 /* Add elements back into the cache */
>                 uint32_t copied = 0;
>                 /* n is multiple of 32 */
> @@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>                 }
>                 cache->len += n;
>
> -               if (cache->len >= cache->flushthresh) {
> -                       rte_mempool_ops_enqueue_bulk
> -                               (mp, &cache->objs[cache->size],
> -                               cache->len - cache->size);
> -                       cache->len = cache->size;
> -               }
> +               /* Increment stat. */
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>                 goto done;
>         }
>
> diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> index 3bb6f305df..307bb8556a 100644
> --- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> +++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
> @@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
>
> rte_lcore_id());
>                 void **cache_objs;
>
> -               if (!cache || cache->len == 0)
> -                       goto normal;
> -
> -               cache_objs = &cache->objs[cache->len];
> -
> -               if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -                       rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +               if (!cache || unlikely(n + cache->len > cache->size)) {
> +                       rte_mempool_generic_put(mp, (void *)txep, n,
> cache);
>                         goto done;
>                 }
>
> -               /* The cache follows the following algorithm
> -                *   1. Add the objects to the cache
> -                *   2. Anything greater than the cache min value (if it
> crosses the
> -                *   cache flush threshold) is flushed to the ring.
> -                */
> +               cache_objs = &cache->objs[cache->len];
> +
>                 /* Add elements back into the cache */
>                 uint32_t copied = 0;
>                 /* n is multiple of 32 */
> @@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
>                 }
>                 cache->len += n;
>
> -               if (cache->len >= cache->flushthresh) {
> -                       rte_mempool_ops_enqueue_bulk(mp,
> -
> &cache->objs[cache->size],
> -                                                    cache->len -
> cache->size);
> -                       cache->len = cache->size;
> -               }
> +               /* Increment stat. */
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>                 goto done;
>         }
>
> -normal:
>         m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>         if (likely(m)) {
>                 free[0] = m;
> diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c
> b/drivers/net/ice/ice_rxtx_vec_avx512.c
> index 04148e8ea2..4ea1db734e 100644
> --- a/drivers/net/ice/ice_rxtx_vec_avx512.c
> +++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
> @@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
>                 struct rte_mempool_cache *cache =
> rte_mempool_default_cache(mp,
>                                 rte_lcore_id());
>
> -               if (!cache || cache->len == 0)
> -                       goto normal;
> -
> -               cache_objs = &cache->objs[cache->len];
> -
> -               if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> -                       rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
> +               if (!cache || unlikely(n + cache->len > cache->size)) {
> +                       rte_mempool_generic_put(mp, (void *)txep, n,
> cache);
>                         goto done;
>                 }
>
> -               /* The cache follows the following algorithm
> -                *   1. Add the objects to the cache
> -                *   2. Anything greater than the cache min value (if it
> -                *   crosses the cache flush threshold) is flushed to the
> ring.
> -                */
> +               cache_objs = &cache->objs[cache->len];
> +
>                 /* Add elements back into the cache */
>                 uint32_t copied = 0;
>                 /* n is multiple of 32 */
> @@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
>                 }
>                 cache->len += n;
>
> -               if (cache->len >= cache->flushthresh) {
> -                       rte_mempool_ops_enqueue_bulk
> -                               (mp, &cache->objs[cache->size],
> -                                cache->len - cache->size);
> -                       cache->len = cache->size;
> -               }
> +               /* Increment stat. */
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> +
>                 goto done;
>         }
>
> -normal:
>         m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
>         if (likely(m)) {
>                 free[0] = m;
> diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
> index dffef062e4..3c49b41a6d 100644
> --- a/lib/mempool/mempool_trace.h
> +++ b/lib/mempool/mempool_trace.h
> @@ -112,7 +112,6 @@ RTE_TRACE_POINT(
>         rte_trace_point_emit_i32(socket_id);
>         rte_trace_point_emit_ptr(cache);
>         rte_trace_point_emit_u32(cache->len);
> -       rte_trace_point_emit_u32(cache->flushthresh);
>  )
>
>  RTE_TRACE_POINT(
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index d8e39e5c20..40fb13239a 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -50,11 +50,6 @@ static void
>  mempool_event_callback_invoke(enum rte_mempool_event event,
>                               struct rte_mempool *mp);
>
> -/* Note: avoid using floating point since that compiler
> - * may not think that is constant.
> - */
> -#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
> -
>  #if defined(RTE_ARCH_X86)
>  /*
>   * return the greatest common divisor between a and b (fast algorithm)
> @@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
>  static void
>  mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
>  {
> -       /* Check that cache have enough space for flush threshold */
> -
>  RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
> +       /* Check that the cache has enough space for size */
> +       RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
>                          RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
>                          RTE_SIZEOF_FIELD(struct rte_mempool_cache,
> objs[0]));
>
>         cache->size = size;
> -       cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
>         cache->len = 0;
>  }
>
> @@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n,
> unsigned elt_size,
>
>         /* asked cache too big */
>         if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
> -           CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
> +           cache_size > n) {
>                 rte_errno = EINVAL;
>                 return NULL;
>         }
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 7bdc92b812..0801cec24a 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
>   */
>  struct __rte_cache_aligned rte_mempool_cache {
>         uint32_t size;        /**< Size of the cache */
> -       uint32_t flushthresh; /**< Threshold before we flush excess
> elements */
>         uint32_t len;         /**< Current cache count */
>  #ifdef RTE_LIBRTE_MEMPOOL_STATS
> -       uint32_t unused;
>         /*
>          * Alternative location for the most frequently updated mempool
> statistics (per-lcore),
>          * providing faster update access when using a mempool cache.
> @@ -110,7 +108,7 @@ struct __rte_cache_aligned rte_mempool_cache {
>          * Cache is allocated to this size to allow it to overflow in
> certain
>          * cases to avoid needless emptying of cache.
>          */
> -       alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE
> * 2];
> +       alignas(RTE_CACHE_LINE_SIZE) void
> *objs[RTE_MEMPOOL_CACHE_MAX_SIZE];
>  };
>
>  /**
> @@ -1362,6 +1360,48 @@ rte_mempool_cache_flush(struct rte_mempool_cache
> *cache,
>         cache->len = 0;
>  }
>
> +/**
> + * @internal Put several objects back in the mempool; used internally when
> + *   the number of objects exceeds the remaining space in the mempool
> cache.
> + * @param mp
> + *   A pointer to the mempool structure.
> + * @param obj_table
> + *   A pointer to a table of void * pointers (objects).
> + * @param n
> + *   The number of objects to store back in the mempool, must be strictly
> + *   positive.
> + *   Must be more than the remaining space in the mempool cache, i.e.:
> + *   cache->len + n > cache->size
> + *   Must be less than the size of the mempool cache, i.e.:
> + *   n < cache->size
> + * @param cache
> + *   A pointer to a mempool cache structure. Not NULL.
> + */
> +static void
> +rte_mempool_do_generic_put_split(struct rte_mempool *mp, void * const
> *obj_table,
> +               unsigned int n, struct rte_mempool_cache * const cache)
> +{
> +       void **cache_objs;
> +       unsigned int len;
> +       const uint32_t cache_size = cache->size;
> +
> +       /* Fill the cache with the first objects. */
> +       cache_objs = &cache->objs[cache->len];
> +       len = (cache_size - cache->len);
> +       cache->len = n - len; /* Moved to here (for performance). */
> +       /* rte_ */ memcpy(cache_objs, obj_table, sizeof(void *) * len);
> +       obj_table += len;
> +       n -= len;
> +
> +       /* Flush the entire cache to the backend. */
> +       cache_objs = &cache->objs[0];
> +       rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache_size);
> +
> +       /* Add the remaining objects to the cache. */
> +       /* Moved from here (for performance): cache->len = n; */
> +       /* rte_ */ memcpy(cache_objs, obj_table, sizeof(void *) * n);
> +}
> +
>  /**
>   * @internal Put several objects back in the mempool; used internally.
>   * @param mp
> @@ -1376,52 +1416,44 @@ rte_mempool_cache_flush(struct rte_mempool_cache
> *cache,
>   */
>  static __rte_always_inline void
>  rte_mempool_do_generic_put(struct rte_mempool *mp, void * const
> *obj_table,
> -                          unsigned int n, struct rte_mempool_cache *cache)
> +                          unsigned int n, struct rte_mempool_cache *
> const cache)
>  {
> -       void **cache_objs;
> -
> -       /* No cache provided */
> +       /* No cache provided? */
>         if (unlikely(cache == NULL))
>                 goto driver_enqueue;
>
> -       /* increment stat now, adding in mempool always success */
> +       /* Increment stats now, adding in mempool always succeeds. */
>         RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
>         RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>
> -       /* The request itself is too big for the cache */
> -       if (unlikely(n > cache->flushthresh))
> +       /* The request itself is known to be too big for any cache? */
> +       if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
>                 goto driver_enqueue_stats_incremented;
>
> -       /*
> -        * The cache follows the following algorithm:
> -        *   1. If the objects cannot be added to the cache without
> crossing
> -        *      the flush threshold, flush the cache to the backend.
> -        *   2. Add the objects to the cache.
> -        */
> +       /* Enough remaining space in the cache? */
> +       if (likely(cache->len + n <= cache->size)) {
> +               void **cache_objs;
>
> -       if (cache->len + n <= cache->flushthresh) {
> +               /* Add the objects to the cache. */
>                 cache_objs = &cache->objs[cache->len];
>                 cache->len += n;
> -       } else {
> -               cache_objs = &cache->objs[0];
> -               rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> -               cache->len = n;
> -       }
> -
> -       /* Add the objects to the cache. */
> -       rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> +               rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> +       } else if (likely(n < cache->size))
> +               rte_mempool_do_generic_put_split(mp, obj_table, n, cache);
> +       else
> +               goto driver_enqueue_stats_incremented;
>
>         return;
>
>  driver_enqueue:
>
> -       /* increment stat now, adding in mempool always success */
> +       /* Increment stats now, adding in mempool always succeeds. */
>         RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
>         RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
>
>  driver_enqueue_stats_incremented:
>
> -       /* push objects to the backend */
> +       /* Push the objects directly to the backend. */
>         rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
>  }
>
> @@ -1490,122 +1522,183 @@ rte_mempool_put(struct rte_mempool *mp, void
> *obj)
>  }
>
>  /**
> - * @internal Get several objects from the mempool; used internally.
> + * @internal Get several objects from the mempool; used internally when
> + *   the number of objects exceeds what is available in the mempool cache.
>   * @param mp
>   *   A pointer to the mempool structure.
>   * @param obj_table
>   *   A pointer to a table of void * pointers (objects).
>   * @param n
>   *   The number of objects to get, must be strictly positive.
> + *   Must be more than available in the mempool cache, i.e.:
> + *   n > cache->len
>   * @param cache
> - *   A pointer to a mempool cache structure. May be NULL if not needed.
> + *   A pointer to a mempool cache structure. Not NULL.
>   * @return
>   *   - 0: Success.
>   *   - <0: Error; code of driver dequeue function.
>   */
> -static __rte_always_inline int
> -rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> -                          unsigned int n, struct rte_mempool_cache *cache)
> +static int
> +rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
> +               unsigned int n, struct rte_mempool_cache * const cache)
>  {
>         int ret;
>         unsigned int remaining;
>         uint32_t index, len;
>         void **cache_objs;
> +       const uint32_t cache_size = cache->size;
>
> -       /* No cache provided */
> -       if (unlikely(cache == NULL)) {
> -               remaining = n;
> -               goto driver_dequeue;
> -       }
> -
> -       /* The cache is a stack, so copy will be in reverse order. */
> +       /* Serve the first part of the request from the cache to return
> hot objects first. */
>         cache_objs = &cache->objs[cache->len];
> +       len = cache->len;
> +       remaining = n - len;
> +       for (index = 0; index < len; index++)
> +               *obj_table++ = *--cache_objs;
>
> -       if (__rte_constant(n) && n <= cache->len) {
> +       /* At this point, the cache is empty. */
> +
> +       /* More than can be served from a full cache? */
> +       if (unlikely(remaining >= cache_size)) {
>                 /*
> -                * The request size is known at build time, and
> -                * the entire request can be satisfied from the cache,
> -                * so let the compiler unroll the fixed length copy loop.
> +                * Serve the following part of the request directly from
> the backend
> +                * in multiples of the cache size.
>                  */
> -               cache->len -= n;
> -               for (index = 0; index < n; index++)
> -                       *obj_table++ = *--cache_objs;
> +               len = remaining - remaining % cache_size;
> +               ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
> +               if (unlikely(ret < 0)) {
> +                       /*
> +                        * No further action is required to roll back the
> request,
> +                        * as objects in the cache are intact, and no
> objects have
> +                        * been dequeued from the backend.
> +                        */
>
> -               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> -               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +                       RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> +                       RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>
> -               return 0;
> -       }
> +                       return ret;
> +               }
>
> -       /*
> -        * Use the cache as much as we have to return hot objects first.
> -        * If the request size 'n' is known at build time, the above
> comparison
> -        * ensures that n > cache->len here, so omit RTE_MIN().
> -        */
> -       len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
> -       cache->len -= len;
> -       remaining = n - len;
> -       for (index = 0; index < len; index++)
> -               *obj_table++ = *--cache_objs;
> +               remaining -= len;
> +               obj_table += len;
>
> -       /*
> -        * If the request size 'n' is known at build time, the case
> -        * where the entire request can be satisfied from the cache
> -        * has already been handled above, so omit handling it here.
> -        */
> -       if (!__rte_constant(n) && remaining == 0) {
> -               /* The entire request is satisfied from the cache. */
> +               if (unlikely(remaining == 0)) {
> +                       cache->len = 0;
>
> -               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> -               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +                       RTE_MEMPOOL_CACHE_STAT_ADD(cache,
> get_success_bulk, 1);
> +                       RTE_MEMPOOL_CACHE_STAT_ADD(cache,
> get_success_objs, n);
>
> -               return 0;
> +                       return 0;
> +               }
>         }
>
> -       /* if dequeue below would overflow mem allocated for cache */
> -       if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> -               goto driver_dequeue;
> -
> -       /* Fill the cache from the backend; fetch size + remaining
> objects. */
> -       ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> -                       cache->size + remaining);
> +       /* Fill the entire cache from the backend. */
> +       ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
>         if (unlikely(ret < 0)) {
>                 /*
> -                * We are buffer constrained, and not able to allocate
> -                * cache + remaining.
> -                * Do not fill the cache, just satisfy the remaining part
> of
> -                * the request directly from the backend.
> +                * Unable to fill the cache.
> +                * Last resort: Try only the remaining part of the request,
> +                * served directly from the backend.
>                  */
> -               goto driver_dequeue;
> +               ret = rte_mempool_ops_dequeue_bulk(mp, obj_table,
> remaining);
> +               if (unlikely(ret == 0)) {
> +                       cache->len = 0;
> +
> +                       RTE_MEMPOOL_CACHE_STAT_ADD(cache,
> get_success_bulk, 1);
> +                       RTE_MEMPOOL_CACHE_STAT_ADD(cache,
> get_success_objs, n);
> +
> +                       return 0;
> +               }
> +
> +               /* Roll back. */
> +               if (cache->len + remaining == n) {
> +                       /*
> +                        * No further action is required to roll back the
> request,
> +                        * as objects in the cache are intact, and no
> objects have
> +                        * been dequeued from the backend.
> +                        */
> +               } else {
> +                       /* Update the state of the cache before putting
> back the objects. */
> +                       cache->len = 0;
> +
> +                       len = n - remaining;
> +                       obj_table -= len;
> +                       rte_mempool_do_generic_put(mp, obj_table, len,
> cache);
> +               }
> +
> +               RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> +               RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> +
> +               return ret;
>         }
>
> -       /* Satisfy the remaining part of the request from the filled
> cache. */
> -       cache_objs = &cache->objs[cache->size + remaining];
> +       /* Serve the remaining part of the request from the filled cache.
> */
> +       cache_objs = &cache->objs[cache_size];
>         for (index = 0; index < remaining; index++)
>                 *obj_table++ = *--cache_objs;
>
> -       cache->len = cache->size;
> +       cache->len = cache_size - remaining;
>
>         RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
>         RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>
>         return 0;
> +}
>
> -driver_dequeue:
> +/**
> + * @internal Get several objects from the mempool; used internally.
> + * @param mp
> + *   A pointer to the mempool structure.
> + * @param obj_table
> + *   A pointer to a table of void * pointers (objects).
> + * @param n
> + *   The number of objects to get, must be strictly positive.
> + * @param cache
> + *   A pointer to a mempool cache structure. May be NULL if not needed.
> + * @return
> + *   - 0: Success.
> + *   - <0: Error; code of driver dequeue function.
> + */
> +static __rte_always_inline int
> +rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> +                          unsigned int n, struct rte_mempool_cache *
> const cache)
> +{
> +       int ret;
>
> -       /* Get remaining objects directly from the backend. */
> -       ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> +       /* No cache provided? */
> +       if (unlikely(cache == NULL))
> +               goto driver_dequeue;
>
> -       if (ret < 0) {
> -               if (likely(cache != NULL)) {
> -                       cache->len = n - remaining;
> -                       /*
> -                        * No further action is required to roll the first
> part
> -                        * of the request back into the cache, as objects
> in
> -                        * the cache are intact.
> -                        */
> -               }
> +       /* The request itself is known to be too big for any cache? */
> +       if (__rte_constant(n) && n >= RTE_MEMPOOL_CACHE_MAX_SIZE)
> +               goto driver_dequeue;
> +
> +       /* The request can be served entirely from the cache? */
> +       if (likely(n <= cache->len)) {
> +               unsigned int index;
> +               void **cache_objs;
>
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +
> +               /*
> +                * The cache is a stack, so copy will be in reverse order.
> +                * If the request size is known at build time,
> +                * the compiler will unroll the fixed length copy loop.
> +                */
> +               cache_objs = &cache->objs[cache->len];
> +               cache->len -= n;
> +               for (index = 0; index < n; index++)
> +                       *obj_table++ = *--cache_objs;
> +
> +               return 0;
> +       } else
> +               return rte_mempool_do_generic_get_split(mp, obj_table, n,
> cache);
> +
> +driver_dequeue:
> +
> +       /* Get the objects directly from the backend. */
> +       ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
> +       if (unlikely(ret < 0)) {
>                 RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
>                 RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>         } else {
> --
> 2.43.0
>
>

[-- Attachment #2: Type: text/html, Size: 44870 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v8] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (5 preceding siblings ...)
  2024-09-24 18:12 ` [RFC PATCH v7] " Morten Brørup
@ 2024-09-25 21:33 ` Morten Brørup
  2024-09-26 18:24 ` [RFC PATCH v9] " Morten Brørup
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-25 21:33 UTC (permalink / raw)
  To: dev; +Cc: Mattias Rönnblom, Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size are held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of the CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
separate helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v8:
* Rewrote rte_mempool_do_generic_put() to get rid of transaction
  splitting. Use a method similar to the existing put method with fill
  followed by flush if overfilled.
  This also made rte_mempool_do_generic_put_split() obsolete.
v7:
* Increased max mempool cache size from 512 to 1024 objects.
  Mainly for CI performance test purposes.
  Originally, the max mempool cache size was 768 objects, and used a fixed
  size array of 1024 objects in the mempool cache structure.
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 ++--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 250 +++++++++++-------
 9 files changed, 196 insertions(+), 222 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that cache have enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..c841a617e0 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -1376,52 +1374,53 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	void **cache_objs;
 
-	/* No cache provided */
+	/* No cache provided? */
 	if (unlikely(cache == NULL))
 		goto driver_enqueue;
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
 		goto driver_enqueue_stats_incremented;
 
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
-
 	/* Add the objects to the cache. */
+	cache_objs = &cache->objs[cache->len];
+	cache->len += n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
 
+	/* Cache size exceeded? */
+	if (unlikely(cache->len > cache->size)) {
+		/*
+		 * Flush a CPU cache line (or mempool cache) size aligned
+		 * number of objects to the backend, as much as we can.
+		 */
+		if (likely(RTE_CACHE_LINE_SIZE / sizeof(void *) <= cache->size))
+			n = cache->len & ~(RTE_CACHE_LINE_SIZE / sizeof(void *) - 1);
+		else
+			n = cache->len - cache->len % cache->size;
+		cache->len -= n;
+		obj_table = &cache->objs[cache->len];
+		goto driver_enqueue_stats_incremented;
+	}
+
 	return;
 
 driver_enqueue:
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
 
 driver_enqueue_stats_incremented:
 
-	/* push objects to the backend */
+	/* Push the objects directly to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1490,122 +1489,183 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static int
+rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
+
+	/* At this point, the cache is empty. */
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the cache size.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+		len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-		return 0;
-	}
+			return ret;
+		}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+		remaining -= len;
+		obj_table += len;
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		if (unlikely(remaining == 0)) {
+			cache->len = 0;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
-		return 0;
+			return 0;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Unable to fill the cache.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			cache->len = 0;
+
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			return 0;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
+		return ret;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+	cache->len = cache_size - remaining;
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
+}
 
-driver_dequeue:
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache * const cache)
+{
+	int ret;
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	/* No cache provided? */
+	if (unlikely(cache == NULL))
+		goto driver_dequeue;
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_dequeue;
 
+	/* The request can be served entirely from the cache? */
+	if (likely(n <= cache->len)) {
+		unsigned int index;
+		void **cache_objs;
+
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+		/*
+		 * The cache is a stack, so copy will be in reverse order.
+		 * If the request size is known at build time,
+		 * the compiler will unroll the fixed length copy loop.
+		 */
+		cache_objs = &cache->objs[cache->len];
+		cache->len -= n;
+		for (index = 0; index < n; index++)
+			*obj_table++ = *--cache_objs;
+
+		return 0;
+	} else
+		return rte_mempool_do_generic_get_split(mp, obj_table, n, cache);
+
+driver_dequeue:
+
+	/* Get the objects directly from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+	if (unlikely(ret < 0)) {
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v9] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (6 preceding siblings ...)
  2024-09-25 21:33 ` [RFC PATCH v8] " Morten Brørup
@ 2024-09-26 18:24 ` Morten Brørup
  2024-09-26 20:53 ` [RFC PATCH v10] " Morten Brørup
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-26 18:24 UTC (permalink / raw)
  To: dev; +Cc: Mattias Rönnblom, Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size are held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of the CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
separate helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v9:
* Removed factor 1.5 from description of cache_size parameter to
  rte_mempool_create().
* Refactored rte_mempool_do_generic_put() to eliminate some gotos.
  No functional change.
* Removed check for n >= RTE_MEMPOOL_CACHE_MAX_SIZE in
  rte_mempool_do_generic_get(); it caused the function to fail when the
  request could not be served from the backend alone, but it could be
  served from the cache and the backend.
* Refactored rte_mempool_do_generic_get_split() to make it shorter.
* When getting objects directly from the backend, use burst size aligned
  with either CPU cache line size or mempool cache size.
v8:
* Rewrote rte_mempool_do_generic_put() to get rid of transaction
  splitting. Use a method similar to the existing put method with fill
  followed by flush if overfilled.
  This also made rte_mempool_do_generic_put_split() obsolete.
* When flushing the cache as much as we can, use burst size aligned with
  either CPU cache line size or mempool cache size.
v7:
* Increased max mempool cache size from 512 to 1024 objects.
  Mainly for CI performance test purposes.
  Originally, the max mempool cache size was 768 objects, and used a fixed
  size array of 1024 objects in the mempool cache structure.
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 ++--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  12 +-
 lib/mempool/rte_mempool.h                     | 271 ++++++++++--------
 9 files changed, 200 insertions(+), 239 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..40fb13239a 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that cache have enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..c1a3df6eca 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -1030,7 +1028,7 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *);
  *   If cache_size is non-zero, the rte_mempool library will try to
  *   limit the accesses to the common lockless pool, by maintaining a
  *   per-lcore object cache. This argument must be lower or equal to
- *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose
+ *   RTE_MEMPOOL_CACHE_MAX_SIZE and n. It is advised to choose
  *   cache_size to have "n modulo cache_size == 0": if this is
  *   not the case, some elements will always stay in the pool and will
  *   never be used. The access to the per-lcore table is of course
@@ -1376,52 +1374,51 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	void **cache_objs;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
+	/* No cache provided? */
+	if (unlikely(cache == NULL)) {
+		/* Increment stats now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
 		goto driver_enqueue;
+	}
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_enqueue;
 
 	/* Add the objects to the cache. */
+	cache_objs = &cache->objs[cache->len];
+	cache->len += n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
 
-	return;
-
-driver_enqueue:
+	/* Cache size not exceeded? */
+	if (likely(cache->len <= cache->size))
+		return;
 
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	/*
+	 * Cache size exceeded.
+	 * Flush a CPU cache line (or mempool cache) size aligned
+	 * bulk of objects to the backend, as much as we can.
+	 */
+	if (likely(RTE_CACHE_LINE_SIZE / sizeof(void *) <= cache->size))
+		n = cache->len & ~(RTE_CACHE_LINE_SIZE / sizeof(void *) - 1);
+	else
+		n = cache->len - cache->len % cache->size;
+	cache->len -= n;
+	obj_table = &cache->objs[cache->len];
 
-driver_enqueue_stats_incremented:
+driver_enqueue:
 
-	/* push objects to the backend */
+	/* Push the objects to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1490,135 +1487,185 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static int
+rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
+
+	/* At this point, the cache is empty. */
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of CPU cache line (or mempool cache) size, as much as we can.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
-
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-		return 0;
-	}
+		if (likely(RTE_CACHE_LINE_SIZE / sizeof(void *) <= cache_size))
+			len = remaining & ~(RTE_CACHE_LINE_SIZE / sizeof(void *) - 1);
+		else
+			len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+			goto fail;
+		}
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		remaining -= len;
+		obj_table += len;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		if (remaining == 0) {
+			/* Update the state of the cache before returning. */
+			cache->len = 0;
 
-		return 0;
+			goto success;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			/* Update the state of the cache before returning. */
+			cache->len = 0;
+
+			goto success;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		goto fail;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+	cache->len = cache_size - remaining;
+
+success:
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
-driver_dequeue:
+fail:
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
+	return ret;
+}
+
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache * const cache)
+{
+	/* Cache provided? */
+	if (likely(cache != NULL)) {
+		/* The request can be served entirely from the cache? */
+		if (likely(n <= cache->len)) {
+			unsigned int index;
+			void **cache_objs;
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
-	} else {
-		if (likely(cache != NULL)) {
 			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			/*
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
+			 */
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_split(mp, obj_table, n, cache);
+	} else {
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v10] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (7 preceding siblings ...)
  2024-09-26 18:24 ` [RFC PATCH v9] " Morten Brørup
@ 2024-09-26 20:53 ` Morten Brørup
  2024-09-28 17:32 ` [RFC PATCH v11] " Morten Brørup
  2024-09-28 19:38 ` [RFC PATCH v12] " Morten Brørup
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-26 20:53 UTC (permalink / raw)
  To: dev; +Cc: Mattias Rönnblom, Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a cache size of N objects, the cache was
actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size are held in the
mempool's caches associated with other lcores, a rightsized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces two optimizations:
1. The mempool caches are flushed to/filled from the backend in their
entirety, so backend accesses are CPU cache line aligned. (Assuming the
mempool cache size is a multiple of the CPU cache line size divided by the
size of a pointer.)
2. The unlikely paths in the get and put functions, where the cache is
flushed to/filled from the backend, are moved from the inline functions to
separate helper functions, thereby reducing the code size of the inline
functions.
Note: Accessing the backend for cacheless mempools remains inline.

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v10:
* Initialize mempool caches, regardless of size zero.
  This to fix compiler warning about out of bounds access.
v9:
* Removed factor 1.5 from description of cache_size parameter to
  rte_mempool_create().
* Refactored rte_mempool_do_generic_put() to eliminate some gotos.
  No functional change.
* Removed check for n >= RTE_MEMPOOL_CACHE_MAX_SIZE in
  rte_mempool_do_generic_get(); it caused the function to fail when the
  request could not be served from the backend alone, but it could be
  served from the cache and the backend.
* Refactored rte_mempool_do_generic_get_split() to make it shorter.
* When getting objects directly from the backend, use burst size aligned
  with either CPU cache line size or mempool cache size.
v8:
* Rewrote rte_mempool_do_generic_put() to get rid of transaction
  splitting. Use a method similar to the existing put method with fill
  followed by flush if overfilled.
  This also made rte_mempool_do_generic_put_split() obsolete.
* When flushing the cache as much as we can, use burst size aligned with
  either CPU cache line size or mempool cache size.
v7:
* Increased max mempool cache size from 512 to 1024 objects.
  Mainly for CI performance test purposes.
  Originally, the max mempool cache size was 768 objects, and used a fixed
  size array of 1024 objects in the mempool cache structure.
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 ++--
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 -
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +-
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +-
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  20 +-
 lib/mempool/rte_mempool.h                     | 270 ++++++++++--------
 9 files changed, 202 insertions(+), 244 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..2ab67bbaeb 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that the cache has enough space for the configured size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
@@ -939,11 +933,9 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 		RTE_PTR_ADD(mp, RTE_MEMPOOL_HEADER_SIZE(mp, 0));
 
 	/* Init all default caches. */
-	if (cache_size != 0) {
-		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
-			mempool_cache_init(&mp->local_cache[lcore_id],
-					   cache_size);
-	}
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+		mempool_cache_init(&mp->local_cache[lcore_id],
+				   cache_size);
 
 	te->data = mp;
 
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..64de688312 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -1030,7 +1028,7 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *);
  *   If cache_size is non-zero, the rte_mempool library will try to
  *   limit the accesses to the common lockless pool, by maintaining a
  *   per-lcore object cache. This argument must be lower or equal to
- *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose
+ *   RTE_MEMPOOL_CACHE_MAX_SIZE and n. It is advised to choose
  *   cache_size to have "n modulo cache_size == 0": if this is
  *   not the case, some elements will always stay in the pool and will
  *   never be used. The access to the per-lcore table is of course
@@ -1376,52 +1374,51 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	void **cache_objs;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
+	/* No cache provided? */
+	if (unlikely(cache == NULL)) {
+		/* Increment stats now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
 		goto driver_enqueue;
+	}
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
-
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_enqueue;
 
 	/* Add the objects to the cache. */
+	cache_objs = &cache->objs[cache->len];
+	cache->len += n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
 
-	return;
-
-driver_enqueue:
+	/* Cache size not exceeded? */
+	if (likely(cache->len <= cache->size))
+		return;
 
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	/*
+	 * Cache size exceeded.
+	 * Flush a CPU cache line (or mempool cache) size aligned
+	 * bulk of objects to the backend, as much as we can.
+	 */
+	if (likely(RTE_CACHE_LINE_SIZE / sizeof(void *) <= cache->size))
+		n = cache->len & ~(RTE_CACHE_LINE_SIZE / sizeof(void *) - 1);
+	else
+		n = cache->len - cache->len % cache->size;
+	cache->len -= n;
+	obj_table = &cache->objs[cache->len];
 
-driver_enqueue_stats_incremented:
+driver_enqueue:
 
-	/* push objects to the backend */
+	/* Push the objects to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1490,135 +1487,184 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
 }
 
 /**
- * @internal Get several objects from the mempool; used internally.
+ * @internal Get several objects from the mempool; used internally when
+ *   the number of objects exceeds what is available in the mempool cache.
  * @param mp
  *   A pointer to the mempool structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
  * @param n
  *   The number of objects to get, must be strictly positive.
+ *   Must be more than available in the mempool cache, i.e.:
+ *   n > cache->len
  * @param cache
- *   A pointer to a mempool cache structure. May be NULL if not needed.
+ *   A pointer to a mempool cache structure. Not NULL.
  * @return
  *   - 0: Success.
  *   - <0: Error; code of driver dequeue function.
  */
-static __rte_always_inline int
-rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+static int
+rte_mempool_do_generic_get_split(struct rte_mempool *mp, void **obj_table,
+		unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
+	const uint32_t cache_size = cache->size;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
-		goto driver_dequeue;
-	}
-
-	/* The cache is a stack, so copy will be in reverse order. */
+	/* Serve the first part of the request from the cache to return hot objects first. */
 	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
+	remaining = n - len;
+	for (index = 0; index < len; index++)
+		*obj_table++ = *--cache_objs;
+
+	/* At this point, the cache is empty. */
 
-	if (__rte_constant(n) && n <= cache->len) {
+	/* More than can be served from a full cache? */
+	if (unlikely(remaining >= cache_size)) {
 		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
+		 * Serve the following part of the request directly from the backend
+		 * in multiples of the CPU cache line (or mempool cache) size, as much as we can.
 		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
-
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-		return 0;
-	}
+		if (likely(RTE_CACHE_LINE_SIZE / sizeof(void *) <= cache_size))
+			len = remaining & ~(RTE_CACHE_LINE_SIZE / sizeof(void *) - 1);
+		else
+			len = remaining - remaining % cache_size;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, len);
+		if (unlikely(ret < 0)) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
-	remaining = n - len;
-	for (index = 0; index < len; index++)
-		*obj_table++ = *--cache_objs;
+			goto fail;
+		}
 
-	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
-	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
+		remaining -= len;
+		obj_table += len;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		if (remaining == 0) {
+			/* Update the state of the cache before returning. */
+			cache->len = 0;
 
-		return 0;
+			goto success;
+		}
 	}
 
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	/* Fill the entire cache from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache_size);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Last resort: Try only the remaining part of the request,
+		 * served directly from the backend.
 		 */
-		goto driver_dequeue;
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+		if (unlikely(ret == 0)) {
+			/* Update the state of the cache before returning. */
+			cache->len = 0;
+
+			goto success;
+		}
+
+		/* Roll back. */
+		if (cache->len + remaining == n) {
+			/*
+			 * No further action is required to roll back the request,
+			 * as objects in the cache are intact, and no objects have
+			 * been dequeued from the backend.
+			 */
+		} else {
+			/* Update the state of the cache before putting back the objects. */
+			cache->len = 0;
+
+			len = n - remaining;
+			obj_table -= len;
+			rte_mempool_do_generic_put(mp, obj_table, len, cache);
+		}
+
+		goto fail;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+	/* Serve the remaining part of the request from the filled cache. */
+	cache_objs = &cache->objs[cache_size];
+	cache->len = cache_size - remaining;
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+success:
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
-driver_dequeue:
+fail:
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
+	return ret;
+}
+
+/**
+ * @internal Get several objects from the mempool; used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param obj_table
+ *   A pointer to a table of void * pointers (objects).
+ * @param n
+ *   The number of objects to get, must be strictly positive.
+ * @param cache
+ *   A pointer to a mempool cache structure. May be NULL if not needed.
+ * @return
+ *   - 0: Success.
+ *   - <0: Error; code of driver dequeue function.
+ */
+static __rte_always_inline int
+rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
+			   unsigned int n, struct rte_mempool_cache * const cache)
+{
+	/* Cache provided? */
+	if (likely(cache != NULL)) {
+		/* The request can be served entirely from the cache? */
+		if (likely(n <= cache->len)) {
+			unsigned int index;
+			void **cache_objs;
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
-	} else {
-		if (likely(cache != NULL)) {
 			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+
+			/*
+			 * The cache is a stack, so copy will be in reverse order.
+			 * If the request size is known at build time,
+			 * the compiler will unroll the fixed length copy loop.
+			 */
+			cache_objs = &cache->objs[cache->len];
+			cache->len -= n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
+
+			return 0;
+		} else
+			return rte_mempool_do_generic_get_split(mp, obj_table, n, cache);
+	} else {
+		int ret;
+
+		/* Get the objects directly from the backend. */
+		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
+		if (unlikely(ret < 0)) {
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 		} else {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
 		}
-	}
 
-	return ret;
+		return ret;
+	}
 }
 
 /**
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v11] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (8 preceding siblings ...)
  2024-09-26 20:53 ` [RFC PATCH v10] " Morten Brørup
@ 2024-09-28 17:32 ` Morten Brørup
  2024-09-28 19:38 ` [RFC PATCH v12] " Morten Brørup
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-28 17:32 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a per-lcore cache size of N objects, the
per-lcore caches were actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size are held in the
mempool's caches associated with other lcores, a right-sized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces some optimizations.
(Work in progress. Details to follow later. Submitting to get CI
performance data.)

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v11:
* Removed rte_mempool_do_generic_get_split().
v10:
* Initialize mempool caches, regardless of size zero.
  This to fix compiler warning about out of bounds access.
v9:
* Removed factor 1.5 from description of cache_size parameter to
  rte_mempool_create().
* Refactored rte_mempool_do_generic_put() to eliminate some gotos.
  No functional change.
* Removed check for n >= RTE_MEMPOOL_CACHE_MAX_SIZE in
  rte_mempool_do_generic_get(); it caused the function to fail when the
  request could not be served from the backend alone, even though it could
  be served from the cache and the backend combined.
* Refactored rte_mempool_do_generic_get_split() to make it shorter.
* When getting objects directly from the backend, use burst size aligned
  with either CPU cache line size or mempool cache size.
v8:
* Rewrote rte_mempool_do_generic_put() to get rid of transaction
  splitting. Use a method similar to the existing put method with fill
  followed by flush if overfilled.
  This also made rte_mempool_do_generic_put_split() obsolete.
* When flushing the cache as much as we can, use burst size aligned with
  either CPU cache line size or mempool cache size.
v7:
* Increased max mempool cache size from 512 to 1024 objects.
  Mainly for CI performance test purposes.
  Originally, the max mempool cache size was 768 objects, and used a fixed
  size array of 1024 objects in the mempool cache structure.
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 ++---
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 --
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +--
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +--
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  20 +-
 lib/mempool/rte_mempool.h                     | 212 +++++++++---------
 9 files changed, 152 insertions(+), 236 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..2ab67bbaeb 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that cache have enough space for size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
@@ -939,11 +933,9 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 		RTE_PTR_ADD(mp, RTE_MEMPOOL_HEADER_SIZE(mp, 0));
 
 	/* Init all default caches. */
-	if (cache_size != 0) {
-		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
-			mempool_cache_init(&mp->local_cache[lcore_id],
-					   cache_size);
-	}
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+		mempool_cache_init(&mp->local_cache[lcore_id],
+				   cache_size);
 
 	te->data = mp;
 
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..c8ef7ee53b 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -1030,7 +1028,7 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *);
  *   If cache_size is non-zero, the rte_mempool library will try to
  *   limit the accesses to the common lockless pool, by maintaining a
  *   per-lcore object cache. This argument must be lower or equal to
- *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose
+ *   RTE_MEMPOOL_CACHE_MAX_SIZE and n. It is advised to choose
  *   cache_size to have "n modulo cache_size == 0": if this is
  *   not the case, some elements will always stay in the pool and will
  *   never be used. The access to the per-lcore table is of course
@@ -1376,52 +1374,55 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	void **cache_objs;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
+	/* No cache provided? */
+	if (unlikely(cache == NULL)) {
+		/* Increment stats now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
 		goto driver_enqueue;
+	}
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_enqueue;
 
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
+	/* The request itself warrants bypassing the cache? */
+	if (unlikely(n >= cache->size))
+		goto driver_enqueue;
 
 	/* Add the objects to the cache. */
+	cache_objs = &cache->objs[cache->len];
+	cache->len += n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
 
-	return;
-
-driver_enqueue:
+	/* Cache size not exceeded? */
+	if (likely(cache->len <= cache->size))
+		return;
 
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	/*
+	 * Cache size exceeded.
+	 * Flush a (CPU cache line size aligned, if mempool cache size allows)
+	 * bulk of objects to the backend, as much as we can.
+	 */
+	if (likely(cache->size >= RTE_CACHE_LINE_SIZE / sizeof(void *)))
+		n = RTE_ALIGN_FLOOR(cache->len, RTE_CACHE_LINE_SIZE / sizeof(void *));
+	else
+		n = cache->len;
+	cache->len -= n;
+	obj_table = &cache->objs[cache->len];
 
-driver_enqueue_stats_incremented:
+driver_enqueue:
 
-	/* push objects to the backend */
+	/* Push the objects to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1505,86 +1506,89 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
  */
 static __rte_always_inline int
 rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
+	/* No cache provided? */
+	if (unlikely(cache == NULL))
+		goto driver_dequeue;
+
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_dequeue;
+
+	/* The request itself warrants bypassing the cache? */
+	if (unlikely(n >= cache->size))
 		goto driver_dequeue;
-	}
 
 	/* The cache is a stack, so copy will be in reverse order. */
-	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
 
-	if (__rte_constant(n) && n <= cache->len) {
-		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
-		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+	/* The entire request can be served from the cache? */
+	if (n <= len) {
+		if (__rte_constant(n)) {
+			/*
+			 * The request size 'n' is known at build time,
+			 * so let the compiler unroll the fixed length copy loop.
+			 */
+			cache_objs = &cache->objs[len];
+			cache->len = len - n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			goto cache_success;
+		} else {
+			remaining = n;
 
-		return 0;
+			goto cache_dequeue;
+		}
 	}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
+	/* Serve the first part of the request from the cache to return hot objects first. */
+	cache_objs = &cache->objs[len];
 	remaining = n - len;
 	for (index = 0; index < len; index++)
 		*obj_table++ = *--cache_objs;
 
+	/* At this point, the cache is empty. */
+
 	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
+	 * Fill the cache from the backend; fetch cache size + remaining objects.
+	 * Round down to a CPU cache line size aligned bulk, if mempool cache size allows.
 	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
-
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-		return 0;
-	}
-
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	if (likely(cache->size >= RTE_CACHE_LINE_SIZE / sizeof(void *)))
+		len = RTE_ALIGN_FLOOR(cache->size + remaining,
+				RTE_CACHE_LINE_SIZE / sizeof(void *));
+	else
+		len = cache->size + remaining;
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, len);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Retry; fetch only the remaining objects.
+		 * Round up to a CPU cache line size aligned bulk, if mempool cache size allows.
 		 */
-		goto driver_dequeue;
+		if (likely(cache->size >= RTE_CACHE_LINE_SIZE / sizeof(void *)))
+			len = RTE_ALIGN_CEIL(remaining, RTE_CACHE_LINE_SIZE / sizeof(void *));
+		else
+			len = remaining;
+		ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, len);
+		if (unlikely(ret < 0))
+			goto fail;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+cache_dequeue:
+
+	/* Serve the remaining part of the request from the cache. */
+	cache_objs = &cache->objs[len];
+	cache->len = len - remaining;
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+cache_success:
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
@@ -1593,31 +1597,31 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 driver_dequeue:
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	/* Get the objects directly from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
-	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-		} else {
+	if (likely(ret == 0)) {
+		if (unlikely(cache == NULL)) {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+
+			return 0;
 		}
+
+		goto cache_success;
 	}
 
+fail:
+
+	/*
+	 * No further action is required to roll back the request,
+	 * as objects in the cache are intact, and no objects have
+	 * been dequeued from the backend.
+	 */
+
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
 	return ret;
 }
 
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC PATCH v12] mempool: fix mempool cache size
  2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
                   ` (9 preceding siblings ...)
  2024-09-28 17:32 ` [RFC PATCH v11] " Morten Brørup
@ 2024-09-28 19:38 ` Morten Brørup
  10 siblings, 0 replies; 14+ messages in thread
From: Morten Brørup @ 2024-09-28 19:38 UTC (permalink / raw)
  To: dev; +Cc: Morten Brørup

This patch refactors the mempool cache to fix two bugs:
1. When a mempool is created with a per-lcore cache size of N objects, the
per-lcore caches are actually created with a size of 1.5 * N objects.
2. The mempool cache field names did not reflect their purpose;
the "flushthresh" field held the size, and the "size" field held the
number of objects remaining in the cache when returning from a get
operation refilling it from the backend.

Especially the first item could be fatal:
When more objects than a mempool's configured cache size are held in the
mempool's caches associated with other lcores, a right-sized mempool may
unexpectedly run out of objects, causing the application to fail.

Furthermore, this patch introduces some optimizations.
(Work in progress. Details to follow later. Submitting to get CI
performance data.)

Various drivers accessing the mempool directly have been updated
accordingly.
These drivers did not update mempool statistics when accessing the mempool
directly, so that is fixed too.

Note: Performance not yet benchmarked.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v12:
* Do not init mempool caches with size zero; they don't exist.
  Bug introduced in v10.
v11:
* Removed rte_mempool_do_generic_get_split().
v10:
* Initialize mempool caches, even when their size is zero.
  This fixes a compiler warning about out-of-bounds access.
v9:
* Removed factor 1.5 from description of cache_size parameter to
  rte_mempool_create().
* Refactored rte_mempool_do_generic_put() to eliminate some gotos.
  No functional change.
* Removed check for n >= RTE_MEMPOOL_CACHE_MAX_SIZE in
  rte_mempool_do_generic_get(); it caused the function to fail when the
  request could not be served from the backend alone, but it could be
  served from the cache and the backend.
* Refactored rte_mempool_do_generic_get_split() to make it shorter.
* When getting objects directly from the backend, use burst size aligned
  with either CPU cache line size or mempool cache size.
v8:
* Rewrote rte_mempool_do_generic_put() to get rid of transaction
  splitting. Use a method similar to the existing put method with fill
  followed by flush if overfilled.
  This also made rte_mempool_do_generic_put_split() obsolete.
* When flushing the cache as much as we can, use burst size aligned with
  either CPU cache line size or mempool cache size.
v7:
* Increased max mempool cache size from 512 to 1024 objects.
  Mainly for CI performance test purposes.
  Originally, the max mempool cache size was 768 objects, and used a fixed
  size array of 1024 objects in the mempool cache structure.
v6:
* Fix v5 incomplete implementation of passing large requests directly to
  the backend.
* Use memcpy instead of rte_memcpy where compiler complains about it.
* Added const to some function parameters.
v5:
* Moved helper functions back into the header file, for improved
  performance.
* Pass large requests directly to the backend. This also simplifies the
  code.
v4:
* Updated subject to reflect that misleading names are considered bugs.
* Rewrote patch description to provide more details about the bugs fixed.
  (Mattias Rönnblom)
* Moved helper functions, not to be inlined, to mempool C file.
  (Mattias Rönnblom)
* Pass requests for n >= RTE_MEMPOOL_CACHE_MAX_SIZE objects known at build
  time directly to backend driver, to avoid calling the helper functions.
  This also fixes the compiler warnings about out of bounds array access.
v3:
* Removed __attribute__(assume).
v2:
* Removed mempool perf test; not part of patch set.
---
 drivers/common/idpf/idpf_common_rxtx_avx512.c |  54 ++---
 drivers/mempool/dpaa/dpaa_mempool.c           |  16 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  14 --
 drivers/net/i40e/i40e_rxtx_vec_avx512.c       |  17 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c       |  27 +--
 drivers/net/ice/ice_rxtx_vec_avx512.c         |  27 +--
 lib/mempool/mempool_trace.h                   |   1 -
 lib/mempool/rte_mempool.c                     |  17 +-
 lib/mempool/rte_mempool.h                     | 212 +++++++++---------
 9 files changed, 152 insertions(+), 233 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx_avx512.c b/drivers/common/idpf/idpf_common_rxtx_avx512.c
index 3b5e124ec8..98535a48f3 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx512.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx512.c
@@ -1024,21 +1024,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (cache == NULL || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1056,16 +1048,13 @@ idpf_tx_singleq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m != NULL)) {
 		free[0] = m;
@@ -1335,21 +1324,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1367,16 +1348,13 @@ idpf_tx_splitq_free_bufs_avx512(struct idpf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 74bfcab509..3a936826c8 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -51,8 +51,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	struct bman_pool_params params = {
 		.flags = BMAN_POOL_FLAG_DYNAMIC_BPID
 	};
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 
 	MEMPOOL_INIT_FUNC_TRACE();
 
@@ -120,18 +118,6 @@ dpaa_mbuf_create_pool(struct rte_mempool *mp)
 	rte_memcpy(bp_info, (void *)&rte_dpaa_bpid_info[bpid],
 		   sizeof(struct dpaa_bp_info));
 	mp->pool_data = (void *)bp_info;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA_MBUF_MAX_ACQ_REL;
-	}
 
 	DPAA_MEMPOOL_INFO("BMAN pool created for bpid =%d", bpid);
 	return 0;
@@ -234,7 +220,7 @@ dpaa_mbuf_alloc_bulk(struct rte_mempool *pool,
 	DPAA_MEMPOOL_DPDEBUG("Request to alloc %d buffers in bpid = %d",
 			     count, bp_info->bpid);
 
-	if (unlikely(count >= (RTE_MEMPOOL_CACHE_MAX_SIZE * 2))) {
+	if (unlikely(count >= RTE_MEMPOOL_CACHE_MAX_SIZE)) {
 		DPAA_MEMPOOL_ERR("Unable to allocate requested (%u) buffers",
 				 count);
 		return -1;
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 42e17d984c..a44f3cf616 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -44,8 +44,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	struct dpaa2_bp_info *bp_info;
 	struct dpbp_attr dpbp_attr;
 	uint32_t bpid;
-	unsigned int lcore_id;
-	struct rte_mempool_cache *cache;
 	int ret;
 
 	avail_dpbp = dpaa2_alloc_dpbp_dev();
@@ -134,18 +132,6 @@ rte_hw_mbuf_create_pool(struct rte_mempool *mp)
 	DPAA2_MEMPOOL_DEBUG("BP List created for bpid =%d", dpbp_attr.bpid);
 
 	h_bp_list = bp_list;
-	/* Update per core mempool cache threshold to optimal value which is
-	 * number of buffers that can be released to HW buffer pool in
-	 * a single API call.
-	 */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-		cache = &mp->local_cache[lcore_id];
-		DPAA2_MEMPOOL_DEBUG("lCore %d: cache->flushthresh %d -> %d",
-			lcore_id, cache->flushthresh,
-			(uint32_t)(cache->size + DPAA2_MBUF_MAX_ACQ_REL));
-		if (cache->flushthresh)
-			cache->flushthresh = cache->size + DPAA2_MBUF_MAX_ACQ_REL;
-	}
 
 	return 0;
 err3:
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 0238b03f8a..712ab1726f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -783,18 +783,13 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+		if (!cache || unlikely(n + cache->len > cache->size)) {
 			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
 		cache_objs = &cache->objs[cache->len];
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -812,12 +807,10 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 3bb6f305df..307bb8556a 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1873,21 +1873,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 								rte_lcore_id());
 		void **cache_objs;
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it crosses the
-		 *   cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -1905,16 +1897,13 @@ iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp,
-						     &cache->objs[cache->size],
-						     cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 04148e8ea2..4ea1db734e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -888,21 +888,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
 				rte_lcore_id());
 
-		if (!cache || cache->len == 0)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+		if (!cache || unlikely(n + cache->len > cache->size)) {
+			rte_mempool_generic_put(mp, (void *)txep, n, cache);
 			goto done;
 		}
 
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
+		cache_objs = &cache->objs[cache->len];
+
 		/* Add elements back into the cache */
 		uint32_t copied = 0;
 		/* n is multiple of 32 */
@@ -920,16 +912,13 @@ ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
 		}
 		cache->len += n;
 
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				 cache->len - cache->size);
-			cache->len = cache->size;
-		}
+		/* Increment stat. */
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
+
 		goto done;
 	}
 
-normal:
 	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
 	if (likely(m)) {
 		free[0] = m;
diff --git a/lib/mempool/mempool_trace.h b/lib/mempool/mempool_trace.h
index dffef062e4..3c49b41a6d 100644
--- a/lib/mempool/mempool_trace.h
+++ b/lib/mempool/mempool_trace.h
@@ -112,7 +112,6 @@ RTE_TRACE_POINT(
 	rte_trace_point_emit_i32(socket_id);
 	rte_trace_point_emit_ptr(cache);
 	rte_trace_point_emit_u32(cache->len);
-	rte_trace_point_emit_u32(cache->flushthresh);
 )
 
 RTE_TRACE_POINT(
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index d8e39e5c20..11dae53b02 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -50,11 +50,6 @@ static void
 mempool_event_callback_invoke(enum rte_mempool_event event,
 			      struct rte_mempool *mp);
 
-/* Note: avoid using floating point since that compiler
- * may not think that is constant.
- */
-#define CALC_CACHE_FLUSHTHRESH(c) (((c) * 3) / 2)
-
 #if defined(RTE_ARCH_X86)
 /*
  * return the greatest common divisor between a and b (fast algorithm)
@@ -746,13 +741,12 @@ rte_mempool_free(struct rte_mempool *mp)
 static void
 mempool_cache_init(struct rte_mempool_cache *cache, uint32_t size)
 {
-	/* Check that cache have enough space for flush threshold */
-	RTE_BUILD_BUG_ON(CALC_CACHE_FLUSHTHRESH(RTE_MEMPOOL_CACHE_MAX_SIZE) >
+	/* Check that the cache has enough space for the configured size */
+	RTE_BUILD_BUG_ON(RTE_MEMPOOL_CACHE_MAX_SIZE >
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs) /
 			 RTE_SIZEOF_FIELD(struct rte_mempool_cache, objs[0]));
 
 	cache->size = size;
-	cache->flushthresh = CALC_CACHE_FLUSHTHRESH(size);
 	cache->len = 0;
 }
 
@@ -836,7 +830,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 
 	/* asked cache too big */
 	if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
-	    CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
+	    cache_size > n) {
 		rte_errno = EINVAL;
 		return NULL;
 	}
@@ -1046,8 +1040,9 @@ rte_mempool_dump_cache(FILE *f, const struct rte_mempool *mp)
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
 		cache_count = mp->local_cache[lcore_id].len;
-		fprintf(f, "    cache_count[%u]=%"PRIu32"\n",
-			lcore_id, cache_count);
+		if (cache_count > 0)
+			fprintf(f, "    cache_count[%u]=%"PRIu32"\n",
+				lcore_id, cache_count);
 		count += cache_count;
 	}
 	fprintf(f, "    total_cache_count=%u\n", count);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 7bdc92b812..c8ef7ee53b 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -89,10 +89,8 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
  */
 struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
-	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	uint32_t unused;
 	/*
 	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
 	 * providing faster update access when using a mempool cache.
@@ -1030,7 +1028,7 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *);
  *   If cache_size is non-zero, the rte_mempool library will try to
  *   limit the accesses to the common lockless pool, by maintaining a
  *   per-lcore object cache. This argument must be lower or equal to
- *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose
+ *   RTE_MEMPOOL_CACHE_MAX_SIZE and n. It is advised to choose
  *   cache_size to have "n modulo cache_size == 0": if this is
  *   not the case, some elements will always stay in the pool and will
  *   never be used. The access to the per-lcore table is of course
@@ -1376,52 +1374,55 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
  */
 static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	void **cache_objs;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL))
+	/* No cache provided? */
+	if (unlikely(cache == NULL)) {
+		/* Increment stats now, adding in mempool always succeeds. */
+		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
 		goto driver_enqueue;
+	}
 
-	/* increment stat now, adding in mempool always success */
+	/* Increment stats now, adding in mempool always succeeds. */
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* The request itself is too big for the cache */
-	if (unlikely(n > cache->flushthresh))
-		goto driver_enqueue_stats_incremented;
-
-	/*
-	 * The cache follows the following algorithm:
-	 *   1. If the objects cannot be added to the cache without crossing
-	 *      the flush threshold, flush the cache to the backend.
-	 *   2. Add the objects to the cache.
-	 */
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_enqueue;
 
-	if (cache->len + n <= cache->flushthresh) {
-		cache_objs = &cache->objs[cache->len];
-		cache->len += n;
-	} else {
-		cache_objs = &cache->objs[0];
-		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
-		cache->len = n;
-	}
+	/* The request itself warrants bypassing the cache? */
+	if (unlikely(n >= cache->size))
+		goto driver_enqueue;
 
 	/* Add the objects to the cache. */
+	cache_objs = &cache->objs[cache->len];
+	cache->len += n;
 	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
 
-	return;
-
-driver_enqueue:
+	/* Cache size not exceeded? */
+	if (likely(cache->len <= cache->size))
+		return;
 
-	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	/*
+	 * Cache size exceeded.
+	 * Flush as many objects as possible to the backend, in a bulk that is
+	 * CPU cache line size aligned, if the mempool cache size allows it.
+	 */
+	if (likely(cache->size >= RTE_CACHE_LINE_SIZE / sizeof(void *)))
+		n = RTE_ALIGN_FLOOR(cache->len, RTE_CACHE_LINE_SIZE / sizeof(void *));
+	else
+		n = cache->len;
+	cache->len -= n;
+	obj_table = &cache->objs[cache->len];
 
-driver_enqueue_stats_incremented:
+driver_enqueue:
 
-	/* push objects to the backend */
+	/* Push the objects to the backend. */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
 
@@ -1505,86 +1506,89 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
  */
 static __rte_always_inline int
 rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
-			   unsigned int n, struct rte_mempool_cache *cache)
+			   unsigned int n, struct rte_mempool_cache * const cache)
 {
 	int ret;
 	unsigned int remaining;
 	uint32_t index, len;
 	void **cache_objs;
 
-	/* No cache provided */
-	if (unlikely(cache == NULL)) {
-		remaining = n;
+	/* No cache provided? */
+	if (unlikely(cache == NULL))
+		goto driver_dequeue;
+
+	/* The request itself is too big for cache storage? */
+	if (unlikely(n >= RTE_MEMPOOL_CACHE_MAX_SIZE))
+		goto driver_dequeue;
+
+	/* The request itself warrants bypassing the cache? */
+	if (unlikely(n >= cache->size))
 		goto driver_dequeue;
-	}
 
 	/* The cache is a stack, so copy will be in reverse order. */
-	cache_objs = &cache->objs[cache->len];
+	len = cache->len;
 
-	if (__rte_constant(n) && n <= cache->len) {
-		/*
-		 * The request size is known at build time, and
-		 * the entire request can be satisfied from the cache,
-		 * so let the compiler unroll the fixed length copy loop.
-		 */
-		cache->len -= n;
-		for (index = 0; index < n; index++)
-			*obj_table++ = *--cache_objs;
+	/* The entire request can be served from the cache? */
+	if (n <= len) {
+		if (__rte_constant(n)) {
+			/*
+			 * The request size 'n' is known at build time,
+			 * so let the compiler unroll the fixed length copy loop.
+			 */
+			cache_objs = &cache->objs[len];
+			cache->len = len - n;
+			for (index = 0; index < n; index++)
+				*obj_table++ = *--cache_objs;
 
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+			goto cache_success;
+		} else {
+			remaining = n;
 
-		return 0;
+			goto cache_dequeue;
+		}
 	}
 
-	/*
-	 * Use the cache as much as we have to return hot objects first.
-	 * If the request size 'n' is known at build time, the above comparison
-	 * ensures that n > cache->len here, so omit RTE_MIN().
-	 */
-	len = __rte_constant(n) ? cache->len : RTE_MIN(n, cache->len);
-	cache->len -= len;
+	/* Serve the first part of the request from the cache to return hot objects first. */
+	cache_objs = &cache->objs[len];
 	remaining = n - len;
 	for (index = 0; index < len; index++)
 		*obj_table++ = *--cache_objs;
 
+	/* At this point, the cache is empty. */
+
 	/*
-	 * If the request size 'n' is known at build time, the case
-	 * where the entire request can be satisfied from the cache
-	 * has already been handled above, so omit handling it here.
+	 * Fill the cache from the backend; fetch cache size + remaining objects.
+	 * Round down to a CPU cache line size aligned bulk, if mempool cache size allows.
 	 */
-	if (!__rte_constant(n) && remaining == 0) {
-		/* The entire request is satisfied from the cache. */
-
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-
-		return 0;
-	}
-
-	/* if dequeue below would overflow mem allocated for cache */
-	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
-		goto driver_dequeue;
-
-	/* Fill the cache from the backend; fetch size + remaining objects. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
-			cache->size + remaining);
+	if (likely(cache->size >= RTE_CACHE_LINE_SIZE / sizeof(void *)))
+		len = RTE_ALIGN_FLOOR(cache->size + remaining,
+				RTE_CACHE_LINE_SIZE / sizeof(void *));
+	else
+		len = cache->size + remaining;
+	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, len);
 	if (unlikely(ret < 0)) {
 		/*
-		 * We are buffer constrained, and not able to allocate
-		 * cache + remaining.
-		 * Do not fill the cache, just satisfy the remaining part of
-		 * the request directly from the backend.
+		 * Retry; fetch only the remaining objects.
+		 * Round up to a CPU cache line size aligned bulk, if mempool cache size allows.
 		 */
-		goto driver_dequeue;
+		if (likely(cache->size >= RTE_CACHE_LINE_SIZE / sizeof(void *)))
+			len = RTE_ALIGN_CEIL(remaining, RTE_CACHE_LINE_SIZE / sizeof(void *));
+		else
+			len = remaining;
+		ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, len);
+		if (unlikely(ret < 0))
+			goto fail;
 	}
 
-	/* Satisfy the remaining part of the request from the filled cache. */
-	cache_objs = &cache->objs[cache->size + remaining];
+cache_dequeue:
+
+	/* Serve the remaining part of the request from the cache. */
+	cache_objs = &cache->objs[len];
+	cache->len = len - remaining;
 	for (index = 0; index < remaining; index++)
 		*obj_table++ = *--cache_objs;
 
-	cache->len = cache->size;
+cache_success:
 
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
 	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
@@ -1593,31 +1597,31 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 driver_dequeue:
 
-	/* Get remaining objects directly from the backend. */
-	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
+	/* Get the objects directly from the backend. */
+	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
 
-	if (ret < 0) {
-		if (likely(cache != NULL)) {
-			cache->len = n - remaining;
-			/*
-			 * No further action is required to roll the first part
-			 * of the request back into the cache, as objects in
-			 * the cache are intact.
-			 */
-		}
-
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
-	} else {
-		if (likely(cache != NULL)) {
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
-			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
-		} else {
+	if (likely(ret == 0)) {
+		if (unlikely(cache == NULL)) {
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
 			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+
+			return 0;
 		}
+
+		goto cache_success;
 	}
 
+fail:
+
+	/*
+	 * No further action is required to roll back the request,
+	 * as objects in the cache are intact, and no objects have
+	 * been dequeued from the backend.
+	 */
+
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
+
 	return ret;
 }
 
-- 
2.43.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2024-09-28 19:38 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-09-20 16:32 [RFC PATCH] mempool: obey configured cache size Morten Brørup
2024-09-20 16:37 ` [RFC PATCH v2] " Morten Brørup
2024-09-20 17:13 ` [RFC PATCH v3] " Morten Brørup
2024-09-20 19:41   ` Mattias Rönnblom
2024-09-22 10:50 ` [RFC PATCH v4] mempool: fix mempool " Morten Brørup
2024-09-24  3:58 ` [RFC PATCH v5] " Morten Brørup
2024-09-24 11:58 ` [RFC PATCH v6] " Morten Brørup
2024-09-24 18:12 ` [RFC PATCH v7] " Morten Brørup
2024-09-24 20:44   ` Patrick Robb
2024-09-25 21:33 ` [RFC PATCH v8] " Morten Brørup
2024-09-26 18:24 ` [RFC PATCH v9] " Morten Brørup
2024-09-26 20:53 ` [RFC PATCH v10] " Morten Brørup
2024-09-28 17:32 ` [RFC PATCH v11] " Morten Brørup
2024-09-28 19:38 ` [RFC PATCH v12] " Morten Brørup

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).