From: Morten Brørup <mb@smartsharesystems.com>
To: olivier.matz@6wind.com, andrew.rybchenko@oktetlabs.ru
Cc: jerinj@marvell.com, thomas@monjalon.net, bruce.richardson@intel.com,
 dev@dpdk.org, Morten Brørup <mb@smartsharesystems.com>
Subject: [PATCH v3 2/2] mempool: optimized debug statistics
Date: Fri, 28 Oct 2022 08:35:04 +0200
Message-Id: <20221028063504.98184-2-mb@smartsharesystems.com>
In-Reply-To: <20221028063504.98184-1-mb@smartsharesystems.com>
References: <20221026144436.71068-1-mb@smartsharesystems.com>
 <20221028063504.98184-1-mb@smartsharesystems.com>

When built with debug enabled (RTE_LIBRTE_MEMPOOL_DEBUG defined), the
performance of mempools with caches is improved as follows.

Accessing objects in the mempool is likely to increment either the
put_bulk and put_objs or the get_success_bulk and get_success_objs
debug statistics counters.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated debug statistics structure is
avoided in the likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache
structure is accessed anyway, in order to update the "len" field.
Updating some debug statistics counters in the same cache line has
lower performance cost than accessing the debug statistics counters in
the dedicated debug statistics structure, i.e. in another cache line.

Running mempool_perf_autotest on a VMware virtual server shows an avg.
increase of 6.4 % in rate_persec for the tests with cache. (Only when
built with debug enabled, obviously!)

For the tests without cache, the avg. increase in rate_persec is
0.8 %. I assume this is noise from the test environment.

v3:
* Try to fix git reference by making the patch part of a series.
* Add --in-reply-to v1 when sending the email.

v2:
* Fix spelling and a repeated word in the commit message, caught by
  checkpatch.
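To illustrate the layout argument, here is a simplified sketch (not
the actual struct rte_mempool_cache definition; the names cache_like
and cache_put are made up for illustration). Assuming the structure is
cache line aligned, as the real one is, "len" and the likely-path
counters fit within the first 64 bytes, so the put fast path touches a
single cache line for both updates:

#include <stdint.h>

/* Simplified stand-in for struct rte_mempool_cache. */
struct cache_like {
	uint32_t size;        /* size of the cache */
	uint32_t flushthresh; /* flush threshold */
	uint32_t len;         /* written on every put/get anyway */
	uint32_t unused;      /* pad, keeps the 64-bit counters aligned */
	/* Likely-path statistics, co-located with "len" (48 bytes total,
	 * within one 64-byte cache line if the struct is aligned): */
	uint64_t put_bulk;
	uint64_t put_objs;
	uint64_t get_success_bulk;
	uint64_t get_success_objs;
};

/* Put fast path: the length update and the statistics updates hit the
 * same cache line, so the statistics cost no extra line access. */
static inline void
cache_put(struct cache_like *cache, unsigned int n)
{
	cache->len += n;
	cache->put_bulk += 1;
	cache->put_objs += n;
}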
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  7 +++++
 lib/mempool/rte_mempool.h | 55 +++++++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..7b8c00a022 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1285,6 +1285,13 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs;
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
+		/* Add the fast access statistics, if local caches exist */
+		if (mp->cache_size != 0) {
+			sum.put_bulk += mp->local_cache[lcore_id].put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].get_success_objs;
+		}
 	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..d84087bc92 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,14 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	uint32_t unused;
+	/* Fast access statistics, only for likely events */
+	uint64_t put_bulk;	   /**< Number of puts. */
+	uint64_t put_objs;	   /**< Number of objects successfully put. */
+	uint64_t get_success_bulk; /**< Successful allocation number. */
+	uint64_t get_success_objs; /**< Objects successfully allocated. */
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -1327,13 +1335,19 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	cache->put_bulk += 1;
+	cache->put_objs += n;
+#endif
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1358,6 +1372,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1464,8 +1484,10 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 		if (remaining == 0) {
 			/* The entire request is satisfied from the cache. */
 
-			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+			cache->get_success_bulk += 1;
+			cache->get_success_objs += n;
+#endif
 
 			return 0;
 		}
@@ -1494,8 +1516,10 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	cache->get_success_bulk += 1;
+	cache->get_success_objs += n;
+#endif
 
 	return 0;
 
@@ -1517,8 +1541,17 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (likely(cache != NULL)) {
+			cache->get_success_bulk += 1;
+			cache->get_success_objs += n;
+		} else {
+#endif
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		}
+#endif
 	}
 
 	return ret;
-- 
2.17.1
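For anyone wanting to observe the counters, a minimal sketch using
only public API (not part of the patch; the pool name and sizes are
arbitrary, and it assumes a build with RTE_LIBRTE_MEMPOOL_DEBUG
defined, e.g. meson -Dc_args=-DRTE_LIBRTE_MEMPOOL_DEBUG):

#include <stdio.h>

#include <rte_eal.h>
#include <rte_mempool.h>

int
main(int argc, char **argv)
{
	struct rte_mempool *mp;
	void *objs[32];

	if (rte_eal_init(argc, argv) < 0)
		return -1;

	/* A pool with a per-lcore cache, so gets and puts update the
	 * fast access counters in struct rte_mempool_cache. */
	mp = rte_mempool_create("stats_demo", 1024, 64, 256, 0,
				NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0);
	if (mp == NULL)
		return -1;

	if (rte_mempool_get_bulk(mp, objs, 32) == 0)
		rte_mempool_put_bulk(mp, objs, 32);

	/* rte_mempool_dump() sums the per-lcore statistics, including
	 * the counters kept in the local caches by this patch. */
	rte_mempool_dump(stdout, mp);

	rte_mempool_free(mp);
	return 0;
}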