DPDK patches and discussions
* [PATCH] mempool: split statistics from debug
@ 2022-10-30 11:54 Morten Brørup
  2022-10-30 14:04 ` Morten Brørup
  2022-10-31 11:26 ` [PATCH v2 1/3] " Morten Brørup
  0 siblings, 2 replies; 39+ messages in thread
From: Morten Brørup @ 2022-10-30 11:54 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko; +Cc: dev, Morten Brørup

Split statistics from debug, to make mempool statistics available without
the performance cost of continuously validating the cookies in the mempool
elements.
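
With this split, the statistics can be enabled without the debug cookie
checks. A sketch of how a build could do that (macro names as introduced
here; the exact override location depends on the build system):

	/* e.g. in config/rte_config.h or via a -D compiler flag */
	#define RTE_LIBRTE_MEMPOOL_STATS 1
	/* RTE_LIBRTE_MEMPOOL_DEBUG left undefined: no cookie validation */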

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 config/rte_config.h       |  2 ++
 lib/mempool/rte_mempool.c | 10 +++++-----
 lib/mempool/rte_mempool.h | 14 +++++++-------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index ae56a86394..1b12d30557 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -47,6 +47,8 @@
 
 /* mempool defines */
 #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+// RTE_LIBRTE_MEMPOOL_STATS is not set
+// RTE_LIBRTE_MEMPOOL_DEBUG is not set
 
 /* mbuf defines */
 #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..f180257a54 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -818,8 +818,8 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
@@ -1221,9 +1221,9 @@ rte_mempool_audit(struct rte_mempool *mp)
 void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	struct rte_mempool_info info;
-	struct rte_mempool_debug_stats sum;
+	struct rte_mempool_stats sum;
 	unsigned lcore_id;
 #endif
 	struct rte_mempool_memhdr *memhdr;
@@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "  common_pool_count=%u\n", common_count);
 
 	/* sum and dump statistics */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..89640e48e5 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -56,14 +56,14 @@ extern "C" {
 #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header cookie. */
 #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer cookie.*/
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 /**
  * A structure that stores the mempool statistics (per-lcore).
  * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are not
  * captured since they can be calculated from other stats.
  * For example: put_cache_objs = put_objs - put_common_pool_objs.
  */
-struct rte_mempool_debug_stats {
+struct rte_mempool_stats {
 	uint64_t put_bulk;             /**< Number of puts. */
 	uint64_t put_objs;             /**< Number of objects successfully put. */
 	uint64_t put_common_pool_bulk; /**< Number of bulks enqueued in common pool. */
@@ -237,9 +237,9 @@ struct rte_mempool {
 	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	struct rte_mempool_stats stats[RTE_MAX_LCORE];
 #endif
 }  __rte_cache_aligned;
 
@@ -293,16 +293,16 @@ struct rte_mempool {
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats are enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
 		unsigned __lcore_id = rte_lcore_id();           \
 		if (__lcore_id < RTE_MAX_LCORE) {               \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH] mempool: split statistics from debug
  2022-10-30 11:54 [PATCH] mempool: split statistics from debug Morten Brørup
@ 2022-10-30 14:04 ` Morten Brørup
  2022-10-30 16:12   ` Stephen Hemminger
  2022-10-31 11:26 ` [PATCH v2 1/3] " Morten Brørup
  1 sibling, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-10-30 14:04 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko; +Cc: dev

> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Sunday, 30 October 2022 12.55
> 
> Split statistics from debug, to make mempool statistics available
> without
> the performance cost of continuously validating the cookies in the
> mempool
> elements.

mempool_perf_autotest shows that the rate_persec drops to a third (-66 %) when enabling full mempool debug - quite prohibitive!

With this patch, the performance cost is much lower. I don't have detailed test results, but the initial tests without mempool cache show a performance drop of -11 %. Significant, but not prohibitive.

> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  config/rte_config.h       |  2 ++
>  lib/mempool/rte_mempool.c | 10 +++++-----
>  lib/mempool/rte_mempool.h | 14 +++++++-------
>  3 files changed, 14 insertions(+), 12 deletions(-)
> 
> diff --git a/config/rte_config.h b/config/rte_config.h
> index ae56a86394..1b12d30557 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -47,6 +47,8 @@
> 
>  /* mempool defines */
>  #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
> +// RTE_LIBRTE_MEMPOOL_STATS is not set
> +// RTE_LIBRTE_MEMPOOL_DEBUG is not set
> 
>  /* mbuf defines */
>  #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 21c94a2b9f..f180257a54 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -818,8 +818,8 @@ rte_mempool_create_empty(const char *name, unsigned
> n, unsigned elt_size,
>  			  RTE_CACHE_LINE_MASK) != 0);
>  	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
>  			  RTE_CACHE_LINE_MASK) != 0);
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> -	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_stats) &
>  			  RTE_CACHE_LINE_MASK) != 0);
>  	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
>  			  RTE_CACHE_LINE_MASK) != 0);
> @@ -1221,9 +1221,9 @@ rte_mempool_audit(struct rte_mempool *mp)
>  void
>  rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>  {
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  	struct rte_mempool_info info;
> -	struct rte_mempool_debug_stats sum;
> +	struct rte_mempool_stats sum;
>  	unsigned lcore_id;
>  #endif
>  	struct rte_mempool_memhdr *memhdr;
> @@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>  	fprintf(f, "  common_pool_count=%u\n", common_count);
> 
>  	/* sum and dump statistics */
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  	rte_mempool_ops_get_info(mp, &info);
>  	memset(&sum, 0, sizeof(sum));
>  	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 3725a72951..89640e48e5 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -56,14 +56,14 @@ extern "C" {
>  #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header
> cookie. */
>  #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer
> cookie.*/
> 
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  /**
>   * A structure that stores the mempool statistics (per-lcore).
>   * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are
> not
>   * captured since they can be calculated from other stats.
>   * For example: put_cache_objs = put_objs - put_common_pool_objs.
>   */
> -struct rte_mempool_debug_stats {
> +struct rte_mempool_stats {
>  	uint64_t put_bulk;             /**< Number of puts. */
>  	uint64_t put_objs;             /**< Number of objects
> successfully put. */
>  	uint64_t put_common_pool_bulk; /**< Number of bulks enqueued in
> common pool. */
> @@ -237,9 +237,9 @@ struct rte_mempool {
>  	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
>  	struct rte_mempool_memhdr_list mem_list; /**< List of memory
> chunks */
> 
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  	/** Per-lcore statistics. */
> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> +	struct rte_mempool_stats stats[RTE_MAX_LCORE];
>  #endif
>  }  __rte_cache_aligned;
> 
> @@ -293,16 +293,16 @@ struct rte_mempool {
>  	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
>  	)
>  /**
> - * @internal When debug is enabled, store some statistics.
> + * @internal When stats are enabled, store some statistics.
>   *
>   * @param mp
>   *   Pointer to the memory pool.
>   * @param name
>   *   Name of the statistics field to increment in the memory pool.
>   * @param n
> - *   Number to add to the object-oriented statistics.
> + *   Number to add to the statistics.
>   */
> -#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
>  		unsigned __lcore_id = rte_lcore_id();           \
>  		if (__lcore_id < RTE_MAX_LCORE) {               \
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] mempool: split statistics from debug
  2022-10-30 14:04 ` Morten Brørup
@ 2022-10-30 16:12   ` Stephen Hemminger
  2022-10-30 20:29     ` Morten Brørup
  0 siblings, 1 reply; 39+ messages in thread
From: Stephen Hemminger @ 2022-10-30 16:12 UTC (permalink / raw)
  To: Morten Brørup; +Cc: olivier.matz, andrew.rybchenko, dev

On Sun, 30 Oct 2022 15:04:18 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > Sent: Sunday, 30 October 2022 12.55
> > 
> > Split statistics from debug, to make mempool statistics available
> > without
> > the performance cost of continuously validating the cookies in the
> > mempool
> > elements.  
> 
> mempool_perf_autotest shows that the rate_persec drops to a third (-66 %) when enabling full mempool debug - quite prohibitive!
> 
> With this patch, the performance cost is much lower. I don't have detailed test results, but the initial tests without mempool cache show a performance drop of -11 %. Significant, but not prohibitive.


One trick to avoid conditional in fast path would be to add a dummy stats[] per core.

Another would be to move the fast path get/put stats into the mempool_cache.
The current model has stats[] per cpu always on another cache line.

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1f5707f46a21..87905b7286a6 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -236,8 +236,10 @@ struct rte_mempool {
        struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-       /** Per-lcore statistics. */
-       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+       /** Per-lcore statistics.
+        * Allocate one additional per-cpu slot for non-DPDK threads
+        */
+       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -302,10 +304,7 @@ struct rte_mempool {
  */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-               unsigned __lcore_id = rte_lcore_id();           \
-               if (__lcore_id < RTE_MAX_LCORE) {               \
-                       mp->stats[__lcore_id].name += n;        \
-               }                                               \
+       (mp)->stats[rte_lcore_id()].name += n;
        } while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
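
For the second idea (moving the hot counters into the mempool cache), a rough
layout sketch of what that might look like; field names are only illustrative,
not a finished design:

struct rte_mempool_cache {
	uint32_t size;        /**< Size of the cache */
	uint32_t flushthresh; /**< Threshold before we flush excess elements */
	uint32_t len;         /**< Current cache count */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
	/* Hot-path counters placed in the cache line that put/get
	 * already touch for 'len' and 'flushthresh'. */
	uint64_t put_bulk;
	uint64_t put_objs;
	uint64_t get_success_bulk;
	uint64_t get_success_objs;
#endif
	/* cache objects array etc. unchanged */
} __rte_cache_aligned;

The counters would then share a cache line with 'len', so the stats update is
close to free whenever a mempool cache is used.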

^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH] mempool: split statistics from debug
  2022-10-30 16:12   ` Stephen Hemminger
@ 2022-10-30 20:29     ` Morten Brørup
  0 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-10-30 20:29 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: olivier.matz, andrew.rybchenko, dev, thomas

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Sunday, 30 October 2022 17.13
> 
> On Sun, 30 Oct 2022 15:04:18 +0100
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > > Sent: Sunday, 30 October 2022 12.55
> > >
> > > Split statistics from debug, to make mempool statistics available
> > > without
> > > the performance cost of continuously validating the cookies in the
> > > mempool
> > > elements.
> >
> > mempool_perf_autotest shows that the rate_persec drops to a third (-
> 66 %) when enabling full mempool debug - quite prohibitive!
> >
> > With this patch, the performance cost is much lower. I don't have
> detailed test results, but the initial tests without mempool cache show
> a performance drop of -11 %. Significant, but not prohibitive.
> 
> 
> One trick to avoid conditional in fast path would be to add a dummy
> stats[] per core.
> 
> Another would be to move the fast path get/put stats into the
> mempool_cache.
> The current model has stats[] per cpu always on another cache line.

I submitted a patch to move the likely get/put stats to the mempool cache [1], but retracted it shortly after. I realized that splitting the stats from the debug cookies had a much larger performance effect, so we should do this first. We can move the statistics counters into the mempool_cache as part two.

[1]: https://patchwork.dpdk.org/project/dpdk/patch/20221028064152.98341-2-mb@smartsharesystems.com/

Also, it might be too late for 22.11 to move stats to the mempool cache. However, splitting the stats from the debug cookies is extremely simple, so that should be accepted for 22.11 (if reviewed properly).

> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1f5707f46a21..87905b7286a6 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -236,8 +236,10 @@ struct rte_mempool {
>         struct rte_mempool_memhdr_list mem_list; /**< List of memory
> chunks */
> 
>  #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
> -       /** Per-lcore statistics. */
> -       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> +       /** Per-lcore statistics.
> +        * Allocate one additional per-cpu slot for non-DPDK threads
> +        */
> +       struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];

Excellent! As a bonus, this also fixes a bug in the statistics: Non-DPDK thread operations were not counted.

>  #endif
>  }  __rte_cache_aligned;
> 
> @@ -302,10 +304,7 @@ struct rte_mempool {
>   */
>  #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> -               unsigned __lcore_id = rte_lcore_id();           \
> -               if (__lcore_id < RTE_MAX_LCORE) {               \
> -                       mp->stats[__lcore_id].name += n;        \
> -               }                                               \
> +       (mp)->stats[rte_lcore_id()].name += n;

I suppose the index must be offset by one, for rte_lcore_id() returning LCORE_ID_ANY (=UINT32_MAX) to be stored at offset zero:
+       (mp)->stats[rte_lcore_id() + 1].name += n;

>         } while (0)
>  #else
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)

I have marked this patch as Changes Requested, and will submit a v2 patch series with this improvement - but not today, considering the local time.

NB: I am aware that the loop in rte_mempool_dump() in rte_mempool.c must also be updated to account for the additional slot in the stats array. I'll check if there are similar effects elsewhere in the library.
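
For reference, a minimal sketch of the wrap-around that the new indexing
relies on (assuming LCORE_ID_ANY stays UINT32_MAX and rte_lcore_id() keeps
returning unsigned int):

	unsigned int lcore_id = rte_lcore_id(); /* LCORE_ID_ANY == UINT32_MAX in unregistered non-EAL threads */
	unsigned int idx = lcore_id + 1;        /* wraps to 0 for LCORE_ID_ANY; 1..RTE_MAX_LCORE otherwise */
	mp->stats[idx].put_bulk += 1;           /* always within stats[RTE_MAX_LCORE + 1] */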


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v2 1/3] mempool: split statistics from debug
  2022-10-30 11:54 [PATCH] mempool: split statistics from debug Morten Brørup
  2022-10-30 14:04 ` Morten Brørup
@ 2022-10-31 11:26 ` Morten Brørup
  2022-10-31 11:26   ` [PATCH v2 2/3] mempool: include non-DPDK threads in statistics Morten Brørup
                     ` (2 more replies)
  1 sibling, 3 replies; 39+ messages in thread
From: Morten Brørup @ 2022-10-31 11:26 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, stephen, jerinj, bruce.richardson
  Cc: thomas, dev, Morten Brørup

Split statistics from debug, to make mempool statistics available without
the performance cost of continuously validating the cookies in the mempool
elements.

mempool_perf_autotest shows the following change in rate_persec.

When enabling mempool debug without this patch:
-28.1 % and -74.0 %, respectively without and with cache.

When enabling mempool stats (but not debug) with this patch:
-5.8 % and -21.2 %, respectively without and with cache.

v2:
* Fix checkpatch warning:
  Use C style comments in rte_include.h, not C++ style.
* Do not rename the rte_mempool_debug_stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 config/rte_config.h       | 2 ++
 lib/mempool/rte_mempool.c | 6 +++---
 lib/mempool/rte_mempool.h | 6 +++---
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index ae56a86394..3c4876d434 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -47,6 +47,8 @@
 
 /* mempool defines */
 #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+/* RTE_LIBRTE_MEMPOOL_STATS is not set */
+/* RTE_LIBRTE_MEMPOOL_DEBUG is not set */
 
 /* mbuf defines */
 #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..62d1ce764e 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -818,7 +818,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
@@ -1221,7 +1221,7 @@ rte_mempool_audit(struct rte_mempool *mp)
 void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	struct rte_mempool_info info;
 	struct rte_mempool_debug_stats sum;
 	unsigned lcore_id;
@@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "  common_pool_count=%u\n", common_count);
 
 	/* sum and dump statistics */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..9c4bf5549f 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -56,7 +56,7 @@ extern "C" {
 #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header cookie. */
 #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer cookie.*/
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 /**
  * A structure that stores the mempool statistics (per-lcore).
  * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are not
@@ -237,7 +237,7 @@ struct rte_mempool {
 	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	/** Per-lcore statistics. */
 	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
 #endif
@@ -302,7 +302,7 @@ struct rte_mempool {
  * @param n
  *   Number to add to the object-oriented statistics.
  */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
 		unsigned __lcore_id = rte_lcore_id();           \
 		if (__lcore_id < RTE_MAX_LCORE) {               \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-10-31 11:26 ` [PATCH v2 1/3] " Morten Brørup
@ 2022-10-31 11:26   ` Morten Brørup
  2022-11-02  7:52     ` Mattias Rönnblom
  2022-10-31 11:26   ` [PATCH v2 3/3] mempool: use cache for frequently updated statistics Morten Brørup
  2022-11-04 11:17   ` [PATCH v3 1/3] mempool: split stats from debug Morten Brørup
  2 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-10-31 11:26 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, stephen, jerinj, bruce.richardson
  Cc: thomas, dev, Morten Brørup

Offset the stats array index by one, and count non-DPDK threads at index
zero.

This patch provides two benefits:
* Non-DPDK threads are also included in the statistics.
* A conditional in the fast path is removed. Static branch prediction was
  correct, so the performance improvement is negligible.

v2:
* New. No v1 of this patch in the series.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  2 +-
 lib/mempool/rte_mempool.h | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 62d1ce764e..e6208125e0 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
 		sum.put_objs += mp->stats[lcore_id].put_objs;
 		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 9c4bf5549f..16e7e62e3c 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -238,8 +238,11 @@ struct rte_mempool {
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	/** Per-lcore statistics.
+	 *
+	 * Offset by one, to include non-DPDK threads.
+	 */
+	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -304,10 +307,7 @@ struct rte_mempool {
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-		unsigned __lcore_id = rte_lcore_id();           \
-		if (__lcore_id < RTE_MAX_LCORE) {               \
-			mp->stats[__lcore_id].name += n;        \
-		}                                               \
+		(mp)->stats[rte_lcore_id() + 1].name += n;      \
 	} while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v2 3/3] mempool: use cache for frequently updated statistics
  2022-10-31 11:26 ` [PATCH v2 1/3] " Morten Brørup
  2022-10-31 11:26   ` [PATCH v2 2/3] mempool: include non-DPDK threads in statistics Morten Brørup
@ 2022-10-31 11:26   ` Morten Brørup
  2022-11-02  8:01     ` Mattias Rönnblom
  2022-11-04 11:17   ` [PATCH v3 1/3] mempool: split stats from debug Morten Brørup
  2 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-10-31 11:26 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, stephen, jerinj, bruce.richardson
  Cc: thomas, dev, Morten Brørup

When built with statistics enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
performance of mempools with caches is improved as follows.

When accessing objects in the mempool, either the put_bulk and put_objs or
the get_success_bulk and get_success_objs statistics counters are likely
to be incremented.

By adding an alternative set of these counters to the mempool cache
structure, accesing the dedicated statistics structure is avoided in the
likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
Updating some statistics counters in the same cache line has lower
performance cost than accessing the statistics counters in the dedicated
statistics structure, which resides in another cache line.

mempool_perf_autotest with this patch shows the following change in
rate_persec.

Compared to only splitting statistics from debug:
+1.5 % and +14.4 %, respectively without and with cache.

Compared to not enabling mempool stats:
-4.4 % and -9.9 %, respectively without and with cache.

v2:
* Move the statistics counters into a stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  9 +++++
 lib/mempool/rte_mempool.h | 73 ++++++++++++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index e6208125e0..a18e39af04 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
+	if (mp->cache_size != 0) {
+		/* Add the statistics stored in the mempool caches. */
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
+		}
+	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
 	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 16e7e62e3c..5806e75609 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,21 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+	uint32_t unused0;
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	/*
+	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
+	 * providing faster update access when using a mempool cache.
+	 */
+	struct {
+		uint64_t put_bulk;          /**< Number of puts. */
+		uint64_t put_objs;          /**< Number of objects successfully put. */
+		uint64_t get_success_bulk;  /**< Successful allocation number. */
+		uint64_t get_success_objs;  /**< Objects successfully allocated. */
+	} stats;                        /**< Statistics */
+#else
+	uint64_t unused1[4];
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -296,14 +311,14 @@ struct rte_mempool {
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats is enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
@@ -312,6 +327,23 @@ struct rte_mempool {
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
 #endif
+/**
+ * @internal When stats is enabled, store some statistics.
+ *
+ * @param cache
+ *   Pointer to the memory pool cache.
+ * @param name
+ *   Name of the statistics field to increment in the memory pool cache.
+ * @param n
+ *   Number to add to the statistics.
+ */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do { \
+		(cache)->stats.name += n;               \
+	} while (0)
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif
 
 /**
  * @internal Calculate the size of the mempool header.
@@ -1327,13 +1359,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request itself is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1358,6 +1394,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1464,8 +1506,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 		return 0;
 	}
@@ -1494,8 +1536,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
@@ -1517,8 +1559,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		if (likely(cache != NULL)) {
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		} else {
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		}
 	}
 
 	return ret;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-10-31 11:26   ` [PATCH v2 2/3] mempool: include non-DPDK threads in statistics Morten Brørup
@ 2022-11-02  7:52     ` Mattias Rönnblom
  2022-11-02  9:09       ` Morten Brørup
  0 siblings, 1 reply; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-02  7:52 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

On 2022-10-31 12:26, Morten Brørup wrote:
> Offset the stats array index by one, and count non-DPDK threads at index
> zero.
> 
> This patch provides two benefits:
> * Non-DPDK threads are also included in the statistics.
> * A conditional in the fast path is removed. Static branch prediction was
>    correct, so the performance improvement is negligible.
> 
> v2:
> * New. No v1 of this patch in the series.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   lib/mempool/rte_mempool.c |  2 +-
>   lib/mempool/rte_mempool.h | 12 ++++++------
>   2 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 62d1ce764e..e6208125e0 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
>   	rte_mempool_ops_get_info(mp, &info);
>   	memset(&sum, 0, sizeof(sum));
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
>   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>   		sum.put_objs += mp->stats[lcore_id].put_objs;
>   		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 9c4bf5549f..16e7e62e3c 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -238,8 +238,11 @@ struct rte_mempool {
>   	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
>   
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> -	/** Per-lcore statistics. */
> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> +	/** Per-lcore statistics.
> +	 *
> +	 * Offset by one, to include non-DPDK threads.
> +	 */
> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>   #endif
>   }  __rte_cache_aligned;
>   
> @@ -304,10 +307,7 @@ struct rte_mempool {
>    */
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> -		unsigned __lcore_id = rte_lcore_id();           \
> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> -			mp->stats[__lcore_id].name += n;        \
> -		}                                               \
> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \

This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an 
unregistered non-EAL thread? Might be worth a comment, or better a 
rewrite with an explicit LCORE_ID_ANY comparison.

You anyways need a conditional. An atomic add must be used in the 
unregistered EAL thread case.

>   	} while (0)
>   #else
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 3/3] mempool: use cache for frequently updated statistics
  2022-10-31 11:26   ` [PATCH v2 3/3] mempool: use cache for frequently updated statistics Morten Brørup
@ 2022-11-02  8:01     ` Mattias Rönnblom
  2022-11-02  9:29       ` Morten Brørup
  0 siblings, 1 reply; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-02  8:01 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

On 2022-10-31 12:26, Morten Brørup wrote:
> When built with statistics enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> performance of mempools with caches is improved as follows.
> 
> When accessing objects in the mempool, either the put_bulk and put_objs or
> the get_success_bulk and get_success_objs statistics counters are likely
> to be incremented.
> 
> By adding an alternative set of these counters to the mempool cache
> structure, accessing the dedicated statistics structure is avoided in the
> likely cases where these counters are incremented.
> 
> The trick here is that the cache line holding the mempool cache structure
> is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
> Updating some statistics counters in the same cache line has lower
> performance cost than accessing the statistics counters in the dedicated
> statistics structure, which resides in another cache line.
> 
> mempool_perf_autotest with this patch shows the following change in
> rate_persec.
> 
> Compared to only splitting statistics from debug:
> +1.5 % and +14.4 %, respectively without and with cache.
> 
> Compared to not enabling mempool stats:
> -4.4 % and -9.9 %, respectively without and with cache.
> 
> v2:
> * Move the statistics counters into a stats structure.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   lib/mempool/rte_mempool.c |  9 +++++
>   lib/mempool/rte_mempool.h | 73 ++++++++++++++++++++++++++++++++-------
>   2 files changed, 69 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index e6208125e0..a18e39af04 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>   		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
>   		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
>   	}
> +	if (mp->cache_size != 0) {
> +		/* Add the statistics stored in the mempool caches. */
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
> +			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
> +			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
> +			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
> +		}
> +	}
>   	fprintf(f, "  stats:\n");
>   	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
>   	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 16e7e62e3c..5806e75609 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -86,6 +86,21 @@ struct rte_mempool_cache {
>   	uint32_t size;	      /**< Size of the cache */
>   	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>   	uint32_t len;	      /**< Current cache count */
> +	uint32_t unused0;
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +	/*
> +	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
> +	 * providing faster update access when using a mempool cache.
> +	 */
> +	struct {
> +		uint64_t put_bulk;          /**< Number of puts. */
> +		uint64_t put_objs;          /**< Number of objects successfully put. */
> +		uint64_t get_success_bulk;  /**< Successful allocation number. */
> +		uint64_t get_success_objs;  /**< Objects successfully allocated. */
> +	} stats;                        /**< Statistics */
> +#else
> +	uint64_t unused1[4];

Are a particular DPDK version supposed to be ABI compatible with itself, 
with different configuration options? E.g., with and without 
RTE_LIBRTE_MEMPOOL_STATS. Is that why you have those 4 unused uint64_ts?

> +#endif
>   	/**
>   	 * Cache objects
>   	 *
> @@ -296,14 +311,14 @@ struct rte_mempool {
>   	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
>   	)
>   /**
> - * @internal When debug is enabled, store some statistics.
> + * @internal When stats is enabled, store some statistics.
>    *
>    * @param mp
>    *   Pointer to the memory pool.
>    * @param name
>    *   Name of the statistics field to increment in the memory pool.
>    * @param n
> - *   Number to add to the object-oriented statistics.
> + *   Number to add to the statistics.
>    */
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> @@ -312,6 +327,23 @@ struct rte_mempool {
>   #else
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
>   #endif
> +/**
> + * @internal When stats is enabled, store some statistics.
> + *
> + * @param cache
> + *   Pointer to the memory pool cache.
> + * @param name
> + *   Name of the statistics field to increment in the memory pool cache.
> + * @param n
> + *   Number to add to the statistics.
> + */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do { \
> +		(cache)->stats.name += n;               \
> +	} while (0)
> +#else
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)

Somewhat unrelated comment: maybe <rte_common.h> should have a RTE_NOP 
macro.

> +#endif
>   
>   /**
>    * @internal Calculate the size of the mempool header.
> @@ -1327,13 +1359,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>   {
>   	void **cache_objs;
>   
> +	/* No cache provided */
> +	if (unlikely(cache == NULL))
> +		goto driver_enqueue;
> +
>   	/* increment stat now, adding in mempool always success */
> -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>   
> -	/* No cache provided or the request itself is too big for the cache */
> -	if (unlikely(cache == NULL || n > cache->flushthresh))
> -		goto driver_enqueue;
> +	/* The request itself is too big for the cache */
> +	if (unlikely(n > cache->flushthresh))
> +		goto driver_enqueue_stats_incremented;
>   
>   	/*
>   	 * The cache follows the following algorithm:
> @@ -1358,6 +1394,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>   
>   driver_enqueue:
>   
> +	/* increment stat now, adding in mempool always success */
> +	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> +	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +
> +driver_enqueue_stats_incremented:
> +
>   	/* push objects to the backend */
>   	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
>   }
> @@ -1464,8 +1506,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   	if (remaining == 0) {
>   		/* The entire request is satisfied from the cache. */
>   
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
>   		return 0;
>   	}
> @@ -1494,8 +1536,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   
>   	cache->len = cache->size;
>   
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
>   	return 0;
>   
> @@ -1517,8 +1559,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
>   		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>   	} else {
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		if (likely(cache != NULL)) {
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +		} else {
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		}
>   	}
>   
>   	return ret;

^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-02  7:52     ` Mattias Rönnblom
@ 2022-11-02  9:09       ` Morten Brørup
  2022-11-02 15:19         ` Stephen Hemminger
  2022-11-02 17:53         ` Mattias Rönnblom
  0 siblings, 2 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-02  9:09 UTC (permalink / raw)
  To: Mattias Rönnblom, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 2 November 2022 08.53
> 
> On 2022-10-31 12:26, Morten Brørup wrote:
> > Offset the stats array index by one, and count non-DPDK threads at
> index
> > zero.
> >
> > This patch provides two benefits:
> > * Non-DPDK threads are also included in the statistics.
> > * A conditional in the fast path is removed. Static branch prediction
> was
> >    correct, so the performance improvement is negligible.
> >
> > v2:
> > * New. No v1 of this patch in the series.
> >
> > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> >   lib/mempool/rte_mempool.c |  2 +-
> >   lib/mempool/rte_mempool.h | 12 ++++++------
> >   2 files changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 62d1ce764e..e6208125e0 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
> *mp)
> >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >   	rte_mempool_ops_get_info(mp, &info);
> >   	memset(&sum, 0, sizeof(sum));
> > -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
> >   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >   		sum.put_objs += mp->stats[lcore_id].put_objs;
> >   		sum.put_common_pool_bulk += mp-
> >stats[lcore_id].put_common_pool_bulk;
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 9c4bf5549f..16e7e62e3c 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -238,8 +238,11 @@ struct rte_mempool {
> >   	struct rte_mempool_memhdr_list mem_list; /**< List of memory
> chunks */
> >
> >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > -	/** Per-lcore statistics. */
> > -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> > +	/** Per-lcore statistics.
> > +	 *
> > +	 * Offset by one, to include non-DPDK threads.
> > +	 */
> > +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >   #endif
> >   }  __rte_cache_aligned;
> >
> > @@ -304,10 +307,7 @@ struct rte_mempool {
> >    */
> >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> > -		unsigned __lcore_id = rte_lcore_id();           \
> > -		if (__lcore_id < RTE_MAX_LCORE) {               \
> > -			mp->stats[__lcore_id].name += n;        \
> > -		}                                               \
> > +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> 
> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an
> unregistered non-EAL thread? Might be worth a comment, or better a
> rewrite with an explicit LCORE_ID_ANY comparison.

The purpose of this patch is to avoid the comparison here.

Yes, it relies on the wrap to zero, and these conditions:
1. LCORE_ID_ANY being UINT32_MAX, and
2. the return type of rte_lcore_id() being unsigned int, and
3. unsigned int being uint32_t.

When I wrote this, I considered it safe to assume that LCORE_ID_ANY will remain the unsigned equivalent of -1 using the return type of rte_lcore_id(). In other words: If the return type of rte_lcore_id() should change from 32 to 64 bit, LCORE_ID_ANY would be updated accordingly to UINT64_MAX.

Because of this assumption, I didn't use [(rte_lcore_id() + 1) & UINT32_MAX], but just [rte_lcore_id() + 1].

I struggled writing an appropriate comment without making it unacceptably long, but eventually gave up, and settled for the one-line comment in the structure only.
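
If it helps, those assumptions could also be pinned down at build time; a
sketch only (RTE_BUILD_BUG_ON must be placed inside a function body, e.g.
next to the existing checks in rte_mempool_create_empty()):

	RTE_BUILD_BUG_ON(LCORE_ID_ANY != UINT32_MAX);
	RTE_BUILD_BUG_ON(sizeof(unsigned int) != sizeof(uint32_t));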

> 
> You anyways need a conditional. An atomic add must be used in the
> unregistered EAL thread case.

Good point: The "+= n" must be atomic for the unregistered non-EAL threads, because they all share the same stats index.

I just took a look at how software maintained stats are handled elsewhere, and the first thing I found, is the IOAT DMA driver, which also seems to be using non-atomic increment [1] regardless if used by a DPDK thread or not.

[1]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228

However, doing it wrong elsewhere doesn't make it correct doing it wrong here too. :-)

Atomic increments are costly, so I would rather follow your suggestion and reintroduce the comparison. How about:

#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
    unsigned int __lcore_id = rte_lcore_id(); \
    if (likely(__lcore_id < RTE_MAX_LCORE)) { \
        (mp)->stats[__lcore_id].name += n; \
    } else { \
        rte_atomic64_add( \
                (rte_atomic64_t *)&((mp)->stats[RTE_MAX_LCORE].name), n); \
    } \
} while (0)

And the structure comment could be:
 * Plus one, for non-DPDK threads.


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v2 3/3] mempool: use cache for frequently updated statistics
  2022-11-02  8:01     ` Mattias Rönnblom
@ 2022-11-02  9:29       ` Morten Brørup
  2022-11-02 17:55         ` Mattias Rönnblom
  0 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-11-02  9:29 UTC (permalink / raw)
  To: Mattias Rönnblom, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson, thomas
  Cc: dev

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 2 November 2022 09.01
> 
> On 2022-10-31 12:26, Morten Brørup wrote:

[...]

> > +++ b/lib/mempool/rte_mempool.h
> > @@ -86,6 +86,21 @@ struct rte_mempool_cache {
> >   	uint32_t size;	      /**< Size of the cache */
> >   	uint32_t flushthresh; /**< Threshold before we flush excess
> elements */
> >   	uint32_t len;	      /**< Current cache count */
> > +	uint32_t unused0;
> > +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> > +	/*
> > +	 * Alternative location for the most frequently updated mempool
> statistics (per-lcore),
> > +	 * providing faster update access when using a mempool cache.
> > +	 */
> > +	struct {
> > +		uint64_t put_bulk;          /**< Number of puts. */
> > +		uint64_t put_objs;          /**< Number of objects
> successfully put. */
> > +		uint64_t get_success_bulk;  /**< Successful allocation
> number. */
> > +		uint64_t get_success_objs;  /**< Objects successfully
> allocated. */
> > +	} stats;                        /**< Statistics */
> > +#else
> > +	uint64_t unused1[4];
> 
> Are a particular DPDK version supposed to be ABI compatible with
> itself,
> with different configuration options? E.g., with and without
> RTE_LIBRTE_MEMPOOL_STATS. Is that why you have those 4 unused
> uint64_ts?

Valid point: There was no ABI compatibility between builds with and without RTE_LIBRTE_MEMPOOL_STATS before this patch, so there is no need to add partial ABI compatibility here.

The #else part of this structure should be removed.

Does anyone disagree?

> > +#endif


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-02  9:09       ` Morten Brørup
@ 2022-11-02 15:19         ` Stephen Hemminger
  2022-11-02 15:37           ` Morten Brørup
  2022-11-02 17:53         ` Mattias Rönnblom
  1 sibling, 1 reply; 39+ messages in thread
From: Stephen Hemminger @ 2022-11-02 15:19 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Mattias Rönnblom, olivier.matz, andrew.rybchenko, jerinj,
	bruce.richardson, thomas, dev

On Wed, 2 Nov 2022 10:09:26 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > Sent: Wednesday, 2 November 2022 08.53
> > 
> > On 2022-10-31 12:26, Morten Brørup wrote:  
> > > Offset the stats array index by one, and count non-DPDK threads at  
> > index  
> > > zero.
> > >
> > > This patch provides two benefits:
> > > * Non-DPDK threads are also included in the statistics.
> > > * A conditional in the fast path is removed. Static branch prediction  
> > was  
> > >    correct, so the performance improvement is negligible.
> > >
> > > v2:
> > > * New. No v1 of this patch in the series.
> > >
> > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> > >   lib/mempool/rte_mempool.c |  2 +-
> > >   lib/mempool/rte_mempool.h | 12 ++++++------
> > >   2 files changed, 7 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > index 62d1ce764e..e6208125e0 100644
> > > --- a/lib/mempool/rte_mempool.c
> > > +++ b/lib/mempool/rte_mempool.c
> > > @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool  
> > *mp)  
> > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > >   	rte_mempool_ops_get_info(mp, &info);
> > >   	memset(&sum, 0, sizeof(sum));
> > > -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
> > >   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> > >   		sum.put_objs += mp->stats[lcore_id].put_objs;
> > >   		sum.put_common_pool_bulk += mp-
> > >stats[lcore_id].put_common_pool_bulk;
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 9c4bf5549f..16e7e62e3c 100644
> > > --- a/lib/mempool/rte_mempool.h
> > > +++ b/lib/mempool/rte_mempool.h
> > > @@ -238,8 +238,11 @@ struct rte_mempool {
> > >   	struct rte_mempool_memhdr_list mem_list; /**< List of memory  
> > chunks */  
> > >
> > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > -	/** Per-lcore statistics. */
> > > -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> > > +	/** Per-lcore statistics.
> > > +	 *
> > > +	 * Offset by one, to include non-DPDK threads.
> > > +	 */
> > > +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> > >   #endif
> > >   }  __rte_cache_aligned;
> > >
> > > @@ -304,10 +307,7 @@ struct rte_mempool {
> > >    */
> > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > >   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> > > -		unsigned __lcore_id = rte_lcore_id();           \
> > > -		if (__lcore_id < RTE_MAX_LCORE) {               \
> > > -			mp->stats[__lcore_id].name += n;        \
> > > -		}                                               \
> > > +		(mp)->stats[rte_lcore_id() + 1].name += n;      \  
> > 
> > This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an
> > unregistered non-EAL thread? Might be worth a comment, or better a
> > rewrite with an explicit LCORE_ID_ANY comparison.  
> 
> The purpose of this patch is to avoid the comparison here.
> 
> Yes, it relies on the wrap to zero, and these conditions:
> 1. LCORE_ID_ANY being UINT32_MAX, and
> 2. the return type of rte_lcore_id() being unsigned int, and
> 3. unsigned int being uint32_t.
> 
> When I wrote this, I considered it safe to assume that LCORE_ID_ANY will remain the unsigned equivalent of -1 using the return type of rte_lcore_id(). In other words: If the return type of rte_lcore_id() should change from 32 to 64 bit, LCORE_ID_ANY would be updated accordingly to UINT64_MAX.
> 
> Because of this assumption, I didn't use [(rte_lcore_id() + 1) & UINT32_MAX], but just [rte_lcore_id() + 1].
> 
> I struggled writing an appropriate comment without making it unacceptably long, but eventually gave up, and settled for the one-line comment in the structure only.
> 
> > 
> > You anyways need a conditional. An atomic add must be used in the
> > unregistered EAL thread case.  
> 
> Good point: The "+= n" must be atomic for the unregistered non-EAL threads, because they all share the same stats index.
> 
> I just took a look at how software maintained stats are handled elsewhere, and the first thing I found, is the IOAT DMA driver, which also seems to be using non-atomic increment [1] regardless if used by a DPDK thread or not.
> 
> [1]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> 
> However, doing it wrong elsewhere doesn't make it correct doing it wrong here too. :-)
> 
> Atomic increments are costly, so I would rather follow your suggestion and reintroduce the comparison. How about:
> 
> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>     unsigned int __lcore_id = rte_lcore_id(); \
>     if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>         (mp)->stats[__lcore_id].name += n; \
>     } else {
>         rte_atomic64_add( \
>                 (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name), n);\
>     } \
> }
> 
> And the structure comment could be:
>  * Plus one, for non-DPDK threads.
> 

Or use rte_lcore_index() instead of rte_lcore_id()


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-02 15:19         ` Stephen Hemminger
@ 2022-11-02 15:37           ` Morten Brørup
  0 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-02 15:37 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Mattias Rönnblom, olivier.matz, andrew.rybchenko, jerinj,
	bruce.richardson, thomas, dev



Med venlig hilsen / Kind regards,
-Morten Brørup


> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, 2 November 2022 16.19
> To: Morten Brørup
> Cc: Mattias Rönnblom; olivier.matz@6wind.com;
> andrew.rybchenko@oktetlabs.ru; jerinj@marvell.com;
> bruce.richardson@intel.com; thomas@monjalon.net; dev@dpdk.org
> Subject: Re: [PATCH v2 2/3] mempool: include non-DPDK threads in
> statistics
> 
> On Wed, 2 Nov 2022 10:09:26 +0100
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > > Sent: Wednesday, 2 November 2022 08.53
> > >
> > > On 2022-10-31 12:26, Morten Brørup wrote:
> > > > Offset the stats array index by one, and count non-DPDK threads
> at
> > > index
> > > > zero.
> > > >
> > > > This patch provides two benefits:
> > > > * Non-DPDK threads are also included in the statistics.
> > > > * A conditional in the fast path is removed. Static branch
> prediction
> > > was
> > > >    correct, so the performance improvement is negligible.
> > > >
> > > > v2:
> > > > * New. No v1 of this patch in the series.
> > > >
> > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > ---
> > > >   lib/mempool/rte_mempool.c |  2 +-
> > > >   lib/mempool/rte_mempool.h | 12 ++++++------
> > > >   2 files changed, 7 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.c
> b/lib/mempool/rte_mempool.c
> > > > index 62d1ce764e..e6208125e0 100644
> > > > --- a/lib/mempool/rte_mempool.c
> > > > +++ b/lib/mempool/rte_mempool.c
> > > > @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
> rte_mempool
> > > *mp)
> > > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > >   	rte_mempool_ops_get_info(mp, &info);
> > > >   	memset(&sum, 0, sizeof(sum));
> > > > -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > > > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
> lcore_id++) {
> > > >   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> > > >   		sum.put_objs += mp->stats[lcore_id].put_objs;
> > > >   		sum.put_common_pool_bulk += mp-
> > > >stats[lcore_id].put_common_pool_bulk;
> > > > diff --git a/lib/mempool/rte_mempool.h
> b/lib/mempool/rte_mempool.h
> > > > index 9c4bf5549f..16e7e62e3c 100644
> > > > --- a/lib/mempool/rte_mempool.h
> > > > +++ b/lib/mempool/rte_mempool.h
> > > > @@ -238,8 +238,11 @@ struct rte_mempool {
> > > >   	struct rte_mempool_memhdr_list mem_list; /**< List of
> memory
> > > chunks */
> > > >
> > > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > > -	/** Per-lcore statistics. */
> > > > -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> > > > +	/** Per-lcore statistics.
> > > > +	 *
> > > > +	 * Offset by one, to include non-DPDK threads.
> > > > +	 */
> > > > +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> > > >   #endif
> > > >   }  __rte_cache_aligned;
> > > >
> > > > @@ -304,10 +307,7 @@ struct rte_mempool {
> > > >    */
> > > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > >   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> \
> > > > -		unsigned __lcore_id = rte_lcore_id();           \
> > > > -		if (__lcore_id < RTE_MAX_LCORE) {               \
> > > > -			mp->stats[__lcore_id].name += n;        \
> > > > -		}                                               \
> > > > +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> > >
> > > This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
> an
> > > unregistered non-EAL thread? Might be worth a comment, or better a
> > > rewrite with an explicit LCORE_ID_ANY comparison.
> >
> > The purpose of this patch is to avoid the comparison here.
> >
> > Yes, it relies on the wrap to zero, and these conditions:
> > 1. LCORE_ID_ANY being UINT32_MAX, and
> > 2. the return type of rte_lcore_id() being unsigned int, and
> > 3. unsigned int being uint32_t.
> >
> > When I wrote this, I considered it safe to assume that LCORE_ID_ANY
> will remain the unsigned equivalent of -1 using the return type of
> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> accordingly to UINT64_MAX.
> >
> > Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> UINT32_MAX], but just [rte_lcore_id() + 1].
> >
> > I struggled writing an appropriate comment without making it
> unacceptably long, but eventually gave up, and settled for the one-line
> comment in the structure only.
> >
> > >
> > > You anyways need a conditional. An atomic add must be used in the
> > > unregistered EAL thread case.
> >
> > Good point: The "+= n" must be atomic for non-isolated threads.
> >
> > I just took a look at how software maintained stats are handled
> elsewhere, and the first thing I found, is the IOAT DMA driver, which
> also seems to be using non-atomic increment [1] regardless if used by a
> DPDK thread or not.
> >
> > [1]: https://elixir.bootlin.com/dpdk/v22.11-
> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >
> > However, doing it wrong elsewhere doesn't make it correct doing it
> wrong here too. :-)
> >
> > Atomic increments are costly, so I would rather follow your
> suggestion and reintroduce the comparison. How about:
> >
> > #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >     unsigned int __lcore_id = rte_lcore_id(); \
> >     if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >         (mp)->stats[__lcore_id].name += n; \
> >     } else {
> >         rte_atomic64_add( \
> >                 (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name),
> n);\
> >     } \
> > }
> >
> > And the structure comment could be:
> >  * Plus one, for non-DPDK threads.
> >
> 
> Or use rte_lcore_index() instead of rte_lcore_id()
> 

rte_lcore_index() is not inline, so it would impact performance. And looking at its source code [2], it compares and branches anyway. It would effectively just be a long-winded way of converting the rte_lcore_id() return type from unsigned to signed int.

[2]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/lib/eal/common/eal_common_lcore.c#L27
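
For illustration, a hypothetical variant of the macro built on rte_lcore_index() would presumably still look something like this (rough sketch, not tested):

#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
		int __lcore_idx = rte_lcore_index(-1);          \
		if (likely(__lcore_idx >= 0))                   \
			(mp)->stats[__lcore_idx].name += n;     \
	} while (0)

So the comparison would only be moved from the macro into rte_lcore_index() (and a second one added back here), at the cost of a function call.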


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-02  9:09       ` Morten Brørup
  2022-11-02 15:19         ` Stephen Hemminger
@ 2022-11-02 17:53         ` Mattias Rönnblom
  2022-11-03  8:59           ` Morten Brørup
  1 sibling, 1 reply; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-02 17:53 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

On 2022-11-02 10:09, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 2 November 2022 08.53
>>
>> On 2022-10-31 12:26, Morten Brørup wrote:
>>> Offset the stats array index by one, and count non-DPDK threads at
>> index
>>> zero.
>>>
>>> This patch provides two benefits:
>>> * Non-DPDK threads are also included in the statistics.
>>> * A conditional in the fast path is removed. Static branch prediction
>> was
>>>     correct, so the performance improvement is negligible.
>>>
>>> v2:
>>> * New. No v1 of this patch in the series.
>>>
>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>>> ---
>>>    lib/mempool/rte_mempool.c |  2 +-
>>>    lib/mempool/rte_mempool.h | 12 ++++++------
>>>    2 files changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
>>> index 62d1ce764e..e6208125e0 100644
>>> --- a/lib/mempool/rte_mempool.c
>>> +++ b/lib/mempool/rte_mempool.c
>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
>> *mp)
>>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>    	rte_mempool_ops_get_info(mp, &info);
>>>    	memset(&sum, 0, sizeof(sum));
>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
>>>    		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>>>    		sum.put_objs += mp->stats[lcore_id].put_objs;
>>>    		sum.put_common_pool_bulk += mp-
>>> stats[lcore_id].put_common_pool_bulk;
>>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>>> index 9c4bf5549f..16e7e62e3c 100644
>>> --- a/lib/mempool/rte_mempool.h
>>> +++ b/lib/mempool/rte_mempool.h
>>> @@ -238,8 +238,11 @@ struct rte_mempool {
>>>    	struct rte_mempool_memhdr_list mem_list; /**< List of memory
>> chunks */
>>>
>>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>> -	/** Per-lcore statistics. */
>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
>>> +	/** Per-lcore statistics.
>>> +	 *
>>> +	 * Offset by one, to include non-DPDK threads.
>>> +	 */
>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>>>    #endif
>>>    }  __rte_cache_aligned;
>>>
>>> @@ -304,10 +307,7 @@ struct rte_mempool {
>>>     */
>>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>    #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
>>> -		unsigned __lcore_id = rte_lcore_id();           \
>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
>>> -			mp->stats[__lcore_id].name += n;        \
>>> -		}                                               \
>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
>>
>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an
>> unregistered non-EAL thread? Might be worth a comment, or better a
>> rewrite with an explicit LCORE_ID_ANY comparison.
> 
> The purpose of this patch is to avoid the comparison here.
> 
> Yes, it relies on the wrap to zero, and these conditions:
> 1. LCORE_ID_ANY being UINT32_MAX, and
> 2. the return type of rte_lcore_id() being unsigned int, and
> 3. unsigned int being uint32_t.
> 
> When I wrote this, I considered it safe to assume that LCORE_ID_ANY will remain the unsigned equivalent of -1 using the return type of rte_lcore_id(). In other words: If the return type of rte_lcore_id() should change from 32 to 64 bit, LCORE_ID_ANY would be updated accordingly to UINT64_MAX.
> 
> Because of this assumption, I didn't use [(rte_lcore_id() + 1) & UINT32_MAX], but just [rte_lcore_id() + 1].
> 
> I struggled writing an appropriate comment without making it unacceptably long, but eventually gave up, and settled for the one-line comment in the structure only.
> 
>>
>> You anyways need a conditional. An atomic add must be used in the
>> unregistered EAL thread case.
> 
> Good point: The "+= n" must be atomic for non-isolated threads.
> 

Whether the various unregistered non-EAL threads run on isolated cores or
not does not matter.

> I just took a look at how software maintained stats are handled elsewhere, and the first thing I found, is the IOAT DMA driver, which also seems to be using non-atomic increment [1] regardless if used by a DPDK thread or not.
> 
> [1]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> 
> However, doing it wrong elsewhere doesn't make it correct doing it wrong here too. :-)
> 
> Atomic increments are costly, so I would rather follow your suggestion and reintroduce the comparison. How about:
> 
> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>      unsigned int __lcore_id = rte_lcore_id(); \
>      if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>          (mp)->stats[__lcore_id].name += n; \
>      } else {
>          rte_atomic64_add( \
>                  (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name), n);\
>      } \
> }
You are supposed to use GCC C11 intrinsics (e.g., __atomic_fetch_add()).

In addition: technically, you must use an atomic store for the EAL 
thread case (and an atomic load on the reader side), although there are 
tons of examples in DPDK where tearing is ignored. (The generated code 
will likely look the same.)
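
For illustration, a tearing-free update and read with the C11-style builtins could look roughly like this (sketch only; field names taken from the patch):

	/* writer side, EAL thread */
	uint64_t v = __atomic_load_n(&mp->stats[lcore_id].put_objs, __ATOMIC_RELAXED);
	__atomic_store_n(&mp->stats[lcore_id].put_objs, v + n, __ATOMIC_RELAXED);

	/* reader side, e.g. when summing in rte_mempool_dump() */
	sum.put_objs += __atomic_load_n(&mp->stats[lcore_id].put_objs, __ATOMIC_RELAXED);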

DPDK coding conventions require there be no braces for a single statement.

Other than that, it looks good.

> 
> And the structure comment could be:
>   * Plus one, for non-DPDK threads.
> 

"Unregistered non-EAL threads". This is the term the EAL documentation uses.


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 3/3] mempool: use cache for frequently updated statistics
  2022-11-02  9:29       ` Morten Brørup
@ 2022-11-02 17:55         ` Mattias Rönnblom
  0 siblings, 0 replies; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-02 17:55 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson, thomas
  Cc: dev

On 2022-11-02 10:29, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 2 November 2022 09.01
>>
>> On 2022-10-31 12:26, Morten Brørup wrote:
> 
> [...]
> 
>>> +++ b/lib/mempool/rte_mempool.h
>>> @@ -86,6 +86,21 @@ struct rte_mempool_cache {
>>>    	uint32_t size;	      /**< Size of the cache */
>>>    	uint32_t flushthresh; /**< Threshold before we flush excess
>> elements */
>>>    	uint32_t len;	      /**< Current cache count */
>>> +	uint32_t unused0;
>>> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>>> +	/*
>>> +	 * Alternative location for the most frequently updated mempool
>> statistics (per-lcore),
>>> +	 * providing faster update access when using a mempool cache.
>>> +	 */
>>> +	struct {
>>> +		uint64_t put_bulk;          /**< Number of puts. */
>>> +		uint64_t put_objs;          /**< Number of objects
>> successfully put. */
>>> +		uint64_t get_success_bulk;  /**< Successful allocation
>> number. */
>>> +		uint64_t get_success_objs;  /**< Objects successfully
>> allocated. */
>>> +	} stats;                        /**< Statistics */
>>> +#else
>>> +	uint64_t unused1[4];
>>
>> Are a particular DPDK version supposed to be ABI compatible with
>> itself,
>> with different configuration options? E.g., with and without
>> RTE_LIBRTE_MEMPOOL_STATS. Is that why you have those 4 unused
>> uint64_ts?
> 
> Valid point: There was no ABI compatibility between with and without RTE_LIBRTE_MEMPOOL_STATS before this patch, so there is no need to add partial ABI compatibility here.
> 
> The #else part of this structure should be removed.
> 
> Does anyone disagree?
> 
>>> +#endif
> 

I have no opinion on that matter, but I have another question: if you 
remove 'unused1', should you also remove the unused0 field?

^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-02 17:53         ` Mattias Rönnblom
@ 2022-11-03  8:59           ` Morten Brørup
  2022-11-04  8:58             ` Mattias Rönnblom
  0 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-11-03  8:59 UTC (permalink / raw)
  To: Mattias Rönnblom, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 2 November 2022 18.53
> 
> On 2022-11-02 10:09, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >> Sent: Wednesday, 2 November 2022 08.53
> >>
> >> On 2022-10-31 12:26, Morten Brørup wrote:
> >>> Offset the stats array index by one, and count non-DPDK threads at
> >> index
> >>> zero.
> >>>
> >>> This patch provides two benefits:
> >>> * Non-DPDK threads are also included in the statistics.
> >>> * A conditional in the fast path is removed. Static branch
> prediction
> >> was
> >>>     correct, so the performance improvement is negligible.
> >>>
> >>> v2:
> >>> * New. No v1 of this patch in the series.
> >>>
> >>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> >>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> >>> ---
> >>>    lib/mempool/rte_mempool.c |  2 +-
> >>>    lib/mempool/rte_mempool.h | 12 ++++++------
> >>>    2 files changed, 7 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> >>> index 62d1ce764e..e6208125e0 100644
> >>> --- a/lib/mempool/rte_mempool.c
> >>> +++ b/lib/mempool/rte_mempool.c
> >>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
> >> *mp)
> >>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>    	rte_mempool_ops_get_info(mp, &info);
> >>>    	memset(&sum, 0, sizeof(sum));
> >>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> >>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
> >>>    		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >>>    		sum.put_objs += mp->stats[lcore_id].put_objs;
> >>>    		sum.put_common_pool_bulk += mp-
> >>> stats[lcore_id].put_common_pool_bulk;
> >>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> >>> index 9c4bf5549f..16e7e62e3c 100644
> >>> --- a/lib/mempool/rte_mempool.h
> >>> +++ b/lib/mempool/rte_mempool.h
> >>> @@ -238,8 +238,11 @@ struct rte_mempool {
> >>>    	struct rte_mempool_memhdr_list mem_list; /**< List of
> memory
> >> chunks */
> >>>
> >>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>> -	/** Per-lcore statistics. */
> >>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> >>> +	/** Per-lcore statistics.
> >>> +	 *
> >>> +	 * Offset by one, to include non-DPDK threads.
> >>> +	 */
> >>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >>>    #endif
> >>>    }  __rte_cache_aligned;
> >>>
> >>> @@ -304,10 +307,7 @@ struct rte_mempool {
> >>>     */
> >>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>    #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> \
> >>> -		unsigned __lcore_id = rte_lcore_id();           \
> >>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> >>> -			mp->stats[__lcore_id].name += n;        \
> >>> -		}                                               \
> >>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> >>
> >> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
> an
> >> unregistered non-EAL thread? Might be worth a comment, or better a
> >> rewrite with an explicit LCORE_ID_ANY comparison.
> >
> > The purpose of this patch is to avoid the comparison here.
> >
> > Yes, it relies on the wrap to zero, and these conditions:
> > 1. LCORE_ID_ANY being UINT32_MAX, and
> > 2. the return type of rte_lcore_id() being unsigned int, and
> > 3. unsigned int being uint32_t.
> >
> > When I wrote this, I considered it safe to assume that LCORE_ID_ANY
> will remain the unsigned equivalent of -1 using the return type of
> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> accordingly to UINT64_MAX.
> >
> > Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> UINT32_MAX], but just [rte_lcore_id() + 1].
> >
> > I struggled writing an appropriate comment without making it
> unacceptably long, but eventually gave up, and settled for the one-line
> comment in the structure only.
> >
> >>
> >> You anyways need a conditional. An atomic add must be used in the
> >> unregistered EAL thread case.
> >
> > Good point: The "+= n" must be atomic for non-isolated threads.
> >
> 
> If the various unregistered non-EAL threads are run on isolated cores
> or not does not matter.

Agree: They all share the same index, and thus may race, regardless of which cores they are using.

Rephrasing: The "+= n" must be atomic for the unregistered non-EAL threads.
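
Concretely, the single shared entry could be bumped with a relaxed C11-style builtin, e.g. (sketch):

	__atomic_fetch_add(&(mp)->stats[RTE_MAX_LCORE].name, n, __ATOMIC_RELAXED);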

> 
> > I just took a look at how software maintained stats are handled
> elsewhere, and the first thing I found, is the IOAT DMA driver, which
> also seems to be using non-atomic increment [1] regardless if used by a
> DPDK thread or not.
> >
> > [1]: https://elixir.bootlin.com/dpdk/v22.11-
> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >
> > However, doing it wrong elsewhere doesn't make it correct doing it
> wrong here too. :-)
> >
> > Atomic increments are costly, so I would rather follow your
> suggestion and reintroduce the comparison. How about:
> >
> > #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >      unsigned int __lcore_id = rte_lcore_id(); \
> >      if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >          (mp)->stats[__lcore_id].name += n; \
> >      } else {
> >          rte_atomic64_add( \
> >                  (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name),
> n);\
> >      } \
> > }
> You are supposed to use GCC C11 intrinsics (e.g.,
> __atomic_fetch_add()).

Oops. I forgot!

This should be emphasized everywhere in the rte_atomic library, to prevent such mistakes.

> 
> In addition: technically, you must use an atomic store for the EAL
> thread case (and an atomic load on the reader side), although there are
> tons of examples in DPDK where tearing is ignored. (The generated code
> will likely look the same.)

The counters are 64 bit aligned, but tearing could happen on 32 bit architectures.

My initial reaction to your comment was to do it correctly on the EAL threads too, to avoid tearing there too. However, there might be a performance cost for 32 bit architectures, so I will consider that these are only debug counters, and accept the risk of tearing.

> 
> DPDK coding conventions require there be no braces for a single
> statement.

Will fix.

> 
> Other than that, it looks good.
> 
> >
> > And the structure comment could be:
> >   * Plus one, for non-DPDK threads.
> >
> 
> "Unregistered non-EAL threads". This is the term the EAL documentation
> uses.

Thank you for clarifying. I didn't follow that discussion, so the new terminology for threads hasn't stuck with me yet. :-)


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-03  8:59           ` Morten Brørup
@ 2022-11-04  8:58             ` Mattias Rönnblom
  2022-11-04 10:01               ` Morten Brørup
  0 siblings, 1 reply; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-04  8:58 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

On 2022-11-03 09:59, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 2 November 2022 18.53
>>
>> On 2022-11-02 10:09, Morten Brørup wrote:
>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>> Sent: Wednesday, 2 November 2022 08.53
>>>>
>>>> On 2022-10-31 12:26, Morten Brørup wrote:
>>>>> Offset the stats array index by one, and count non-DPDK threads at
>>>> index
>>>>> zero.
>>>>>
>>>>> This patch provides two benefits:
>>>>> * Non-DPDK threads are also included in the statistics.
>>>>> * A conditional in the fast path is removed. Static branch
>> prediction
>>>> was
>>>>>      correct, so the performance improvement is negligible.
>>>>>
>>>>> v2:
>>>>> * New. No v1 of this patch in the series.
>>>>>
>>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>>>>> ---
>>>>>     lib/mempool/rte_mempool.c |  2 +-
>>>>>     lib/mempool/rte_mempool.h | 12 ++++++------
>>>>>     2 files changed, 7 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
>>>>> index 62d1ce764e..e6208125e0 100644
>>>>> --- a/lib/mempool/rte_mempool.c
>>>>> +++ b/lib/mempool/rte_mempool.c
>>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
>>>> *mp)
>>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>     	rte_mempool_ops_get_info(mp, &info);
>>>>>     	memset(&sum, 0, sizeof(sum));
>>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
>>>>>     		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>>>>>     		sum.put_objs += mp->stats[lcore_id].put_objs;
>>>>>     		sum.put_common_pool_bulk += mp-
>>>>> stats[lcore_id].put_common_pool_bulk;
>>>>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>>>>> index 9c4bf5549f..16e7e62e3c 100644
>>>>> --- a/lib/mempool/rte_mempool.h
>>>>> +++ b/lib/mempool/rte_mempool.h
>>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
>>>>>     	struct rte_mempool_memhdr_list mem_list; /**< List of
>> memory
>>>> chunks */
>>>>>
>>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>> -	/** Per-lcore statistics. */
>>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
>>>>> +	/** Per-lcore statistics.
>>>>> +	 *
>>>>> +	 * Offset by one, to include non-DPDK threads.
>>>>> +	 */
>>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>>>>>     #endif
>>>>>     }  __rte_cache_aligned;
>>>>>
>>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
>>>>>      */
>>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>     #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
>> \
>>>>> -		unsigned __lcore_id = rte_lcore_id();           \
>>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
>>>>> -			mp->stats[__lcore_id].name += n;        \
>>>>> -		}                                               \
>>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
>>>>
>>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
>> an
>>>> unregistered non-EAL thread? Might be worth a comment, or better a
>>>> rewrite with an explicit LCORE_ID_ANY comparison.
>>>
>>> The purpose of this patch is to avoid the comparison here.
>>>
>>> Yes, it relies on the wrap to zero, and these conditions:
>>> 1. LCORE_ID_ANY being UINT32_MAX, and
>>> 2. the return type of rte_lcore_id() being unsigned int, and
>>> 3. unsigned int being uint32_t.
>>>
>>> When I wrote this, I considered it safe to assume that LCORE_ID_ANY
>> will remain the unsigned equivalent of -1 using the return type of
>> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
>> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
>> accordingly to UINT64_MAX.
>>>
>>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
>> UINT32_MAX], but just [rte_lcore_id() + 1].
>>>
>>> I struggled writing an appropriate comment without making it
>> unacceptably long, but eventually gave up, and settled for the one-line
>> comment in the structure only.
>>>
>>>>
>>>> You anyways need a conditional. An atomic add must be used in the
>>>> unregistered EAL thread case.
>>>
>>> Good point: The "+= n" must be atomic for non-isolated threads.
>>>
>>
>> If the various unregistered non-EAL threads are run on isolated cores
>> or not does not matter.
> 
> Agree: They all share the same index, and thus may race, regardless which cores they are using.
> 
> Rephrasing: The "+= n" must be atomic for the unregistered non-EAL threads.
> 
>>
>>> I just took a look at how software maintained stats are handled
>> elsewhere, and the first thing I found, is the IOAT DMA driver, which
>> also seems to be using non-atomic increment [1] regardless if used by a
>> DPDK thread or not.
>>>
>>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
>> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
>>>
>>> However, doing it wrong elsewhere doesn't make it correct doing it
>> wrong here too. :-)
>>>
>>> Atomic increments are costly, so I would rather follow your
>> suggestion and reintroduce the comparison. How about:
>>>
>>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>>>       unsigned int __lcore_id = rte_lcore_id(); \
>>>       if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>>>           (mp)->stats[__lcore_id].name += n; \
>>>       } else {
>>>           rte_atomic64_add( \
>>>                   (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name),
>> n);\
>>>       } \
>>> }
>> You are supposed to use GCC C11 intrinsics (e.g.,
>> __atomic_fetch_add()).
> 
> Ups. I forgot!
> 
> This should be emphasized everywhere in the rte_atomic library, to prevent such mistakes.
> 
>>
>> In addition: technically, you must use an atomic store for the EAL
>> thread case (and an atomic load on the reader side), although there are
>> tons of examples in DPDK where tearing is ignored. (The generated code
>> will likely look the same.)
> 
> The counters are 64 bit aligned, but tearing could happen on 32 bit architectures.
> 

The compiler is free to tear such accesses on any architecture, but I'm not
sure if it happens much in practice.

> My initial reaction to your comment was to do it correctly on the EAL threads too, to avoid tearing there too. However, there might be a performance cost for 32 bit architectures, so I will consider that these are only debug counters, and accept the risk of tearing.
> 

What would that cost consist of?

In theory C11 atomics could be implemented "in software" (i.e., without 
the proper ISA-level guarantees, with locks), but does any of the 
DPDK-supported compiler/32-bit ISAs actually do so?

It's also not obvious that, if there is a performance impact to pay for
correctness for a certain compiler/ISA combination, correctness would have
to give way.

>>
>> DPDK coding conventions require there be no braces for a single
>> statement.
> 
> Will fix.
> 
>>
>> Other than that, it looks good.
>>
>>>
>>> And the structure comment could be:
>>>    * Plus one, for non-DPDK threads.
>>>
>>
>> "Unregistered non-EAL threads". This is the term the EAL documentation
>> uses.
> 
> Thank you for clarifying. I didn't follow that discussion, so the new terminology for threads hasn't stuck with me yet. :-)
> 

^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-04  8:58             ` Mattias Rönnblom
@ 2022-11-04 10:01               ` Morten Brørup
  2022-11-07  7:26                 ` Mattias Rönnblom
  0 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 10:01 UTC (permalink / raw)
  To: Mattias Rönnblom, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Friday, 4 November 2022 09.59
> 
> On 2022-11-03 09:59, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >> Sent: Wednesday, 2 November 2022 18.53
> >>
> >> On 2022-11-02 10:09, Morten Brørup wrote:
> >>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >>>> Sent: Wednesday, 2 November 2022 08.53
> >>>>
> >>>> On 2022-10-31 12:26, Morten Brørup wrote:
> >>>>> Offset the stats array index by one, and count non-DPDK threads
> at
> >>>> index
> >>>>> zero.
> >>>>>
> >>>>> This patch provides two benefits:
> >>>>> * Non-DPDK threads are also included in the statistics.
> >>>>> * A conditional in the fast path is removed. Static branch
> >> prediction
> >>>> was
> >>>>>      correct, so the performance improvement is negligible.
> >>>>>
> >>>>> v2:
> >>>>> * New. No v1 of this patch in the series.
> >>>>>
> >>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> >>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> >>>>> ---
> >>>>>     lib/mempool/rte_mempool.c |  2 +-
> >>>>>     lib/mempool/rte_mempool.h | 12 ++++++------
> >>>>>     2 files changed, 7 insertions(+), 7 deletions(-)
> >>>>>
> >>>>> diff --git a/lib/mempool/rte_mempool.c
> b/lib/mempool/rte_mempool.c
> >>>>> index 62d1ce764e..e6208125e0 100644
> >>>>> --- a/lib/mempool/rte_mempool.c
> >>>>> +++ b/lib/mempool/rte_mempool.c
> >>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
> rte_mempool
> >>>> *mp)
> >>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>     	rte_mempool_ops_get_info(mp, &info);
> >>>>>     	memset(&sum, 0, sizeof(sum));
> >>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> >>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
> lcore_id++) {
> >>>>>     		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >>>>>     		sum.put_objs += mp->stats[lcore_id].put_objs;
> >>>>>     		sum.put_common_pool_bulk += mp-
> >>>>> stats[lcore_id].put_common_pool_bulk;
> >>>>> diff --git a/lib/mempool/rte_mempool.h
> b/lib/mempool/rte_mempool.h
> >>>>> index 9c4bf5549f..16e7e62e3c 100644
> >>>>> --- a/lib/mempool/rte_mempool.h
> >>>>> +++ b/lib/mempool/rte_mempool.h
> >>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
> >>>>>     	struct rte_mempool_memhdr_list mem_list; /**< List of
> >> memory
> >>>> chunks */
> >>>>>
> >>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>> -	/** Per-lcore statistics. */
> >>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> >>>>> +	/** Per-lcore statistics.
> >>>>> +	 *
> >>>>> +	 * Offset by one, to include non-DPDK threads.
> >>>>> +	 */
> >>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >>>>>     #endif
> >>>>>     }  __rte_cache_aligned;
> >>>>>
> >>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
> >>>>>      */
> >>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>     #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> >> \
> >>>>> -		unsigned __lcore_id = rte_lcore_id();           \
> >>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> >>>>> -			mp->stats[__lcore_id].name += n;        \
> >>>>> -		}                                               \
> >>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> >>>>
> >>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
> >> an
> >>>> unregistered non-EAL thread? Might be worth a comment, or better a
> >>>> rewrite with an explicit LCORE_ID_ANY comparison.
> >>>
> >>> The purpose of this patch is to avoid the comparison here.
> >>>
> >>> Yes, it relies on the wrap to zero, and these conditions:
> >>> 1. LCORE_ID_ANY being UINT32_MAX, and
> >>> 2. the return type of rte_lcore_id() being unsigned int, and
> >>> 3. unsigned int being uint32_t.
> >>>
> >>> When I wrote this, I considered it safe to assume that LCORE_ID_ANY
> >> will remain the unsigned equivalent of -1 using the return type of
> >> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
> >> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> >> accordingly to UINT64_MAX.
> >>>
> >>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> >> UINT32_MAX], but just [rte_lcore_id() + 1].
> >>>
> >>> I struggled writing an appropriate comment without making it
> >> unacceptably long, but eventually gave up, and settled for the one-
> line
> >> comment in the structure only.
> >>>
> >>>>
> >>>> You anyways need a conditional. An atomic add must be used in the
> >>>> unregistered EAL thread case.
> >>>
> >>> Good point: The "+= n" must be atomic for non-isolated threads.
> >>>
> >>
> >> If the various unregistered non-EAL threads are run on isolated
> cores
> >> or not does not matter.
> >
> > Agree: They all share the same index, and thus may race, regardless
> which cores they are using.
> >
> > Rephrasing: The "+= n" must be atomic for the unregistered non-EAL
> threads.
> >
> >>
> >>> I just took a look at how software maintained stats are handled
> >> elsewhere, and the first thing I found, is the IOAT DMA driver,
> which
> >> also seems to be using non-atomic increment [1] regardless if used
> by a
> >> DPDK thread or not.
> >>>
> >>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
> >> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >>>
> >>> However, doing it wrong elsewhere doesn't make it correct doing it
> >> wrong here too. :-)
> >>>
> >>> Atomic increments are costly, so I would rather follow your
> >> suggestion and reintroduce the comparison. How about:
> >>>
> >>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >>>       unsigned int __lcore_id = rte_lcore_id(); \
> >>>       if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >>>           (mp)->stats[__lcore_id].name += n; \
> >>>       } else {
> >>>           rte_atomic64_add( \
> >>>                   (rte_atomic64_t*)&((mp)-
> >stats[RTE_MAX_LCORE].name),
> >> n);\
> >>>       } \
> >>> }
> >> You are supposed to use GCC C11 intrinsics (e.g.,
> >> __atomic_fetch_add()).
> >
> > Ups. I forgot!
> >
> > This should be emphasized everywhere in the rte_atomic library, to
> prevent such mistakes.
> >
> >>
> >> In addition: technically, you must use an atomic store for the EAL
> >> thread case (and an atomic load on the reader side), although there
> are
> >> tons of examples in DPDK where tearing is ignored. (The generated
> code
> >> will likely look the same.)
> >
> > The counters are 64 bit aligned, but tearing could happen on 32 bit
> architectures.
> >
> 
> The compiler is free to do it on any architecture, but I'm not sure if
> it happens much in practice.
> 
> > My initial reaction to your comment was to do it correctly on the EAL
> threads too, to avoid tearing there too. However, there might be a
> performance cost for 32 bit architectures, so I will consider that
> these are only debug counters, and accept the risk of tearing.
> >
> 
> What would that cost consist of?

I have tested it this morning on Godbolt: https://godbolt.org/z/fz7f3cv8b
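
The snippet behind those listings is presumably along these lines (reconstruction, not the exact Godbolt input):

	#include <stdint.h>

	uint64_t ctr;

	void add_plain(uint64_t n) { ctr += n; }
	void add_atomic(uint64_t n) { __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED); }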

ctr += n becomes:

        add     QWORD PTR ctr[rip], rdi

Whereas __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
 
        lock add        QWORD PTR ctr[rip], rdi

> 
> In theory C11 atomics could be implemented "in software" (i.e., without
> the proper ISA-level guarantees, with locks), but does any of the
> DPDK-supported compiler/32-bit ISAs actually do so?

Adding -m32 to the compiler options, ctr += n becomes:

        xor     edx, edx
        mov     eax, DWORD PTR [esp+4]
        add     DWORD PTR ctr, eax
        adc     DWORD PTR ctr+4, edx

And __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:

        push    edi
        xor     edi, edi
        push    esi
        push    ebx
        sub     esp, 8
        mov     esi, DWORD PTR [esp+24]
        mov     eax, DWORD PTR ctr
        mov     edx, DWORD PTR ctr+4
.L4:
        mov     ecx, eax
        mov     ebx, edx
        add     ecx, esi
        adc     ebx, edi
        mov     DWORD PTR [esp], ecx
        mov     DWORD PTR [esp+4], ebx
        mov     ebx, DWORD PTR [esp]
        mov     ecx, DWORD PTR [esp+4]
        lock cmpxchg8b  QWORD PTR ctr
        jne     .L4
        add     esp, 8
        pop     ebx
        pop     esi
        pop     edi

> 
> It's also not obvious that if there, for a certain
> compiler/ISA-combination, is a performance impact to pay for
> correctness, correctness would have to give way.

In this particular case, and because they are only debug counters, I will fall back on the argument that DPDK generally uses non-atomic access to 64 bit statistics counters. This was also the case for these counters before this patch.

I am planning to finish v3 of this patch series today, so I hope you can agree to close the atomicity discussion with another argument: Making statistics counters atomic should be elevated to a general patch across DPDK, and not part of this mempool patch series.

The v3 patch (which I am working on right now) counts atomically for the unregistered non-EAL threads:

#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
		unsigned int __lcore_id = rte_lcore_id();                       \
		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
			(mp)->stats[__lcore_id].name += n;                      \
		else                                                            \
			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
					   n, __ATOMIC_RELAXED);                \
	} while (0)
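
Call sites are unaffected by this change; they keep using the macro as before, e.g.:

	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);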

PS: Excellent discussion - thank you for getting involved, Mattias.


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v3 1/3] mempool: split stats from debug
  2022-10-31 11:26 ` [PATCH v2 1/3] " Morten Brørup
  2022-10-31 11:26   ` [PATCH v2 2/3] mempool: include non-DPDK threads in statistics Morten Brørup
  2022-10-31 11:26   ` [PATCH v2 3/3] mempool: use cache for frequently updated statistics Morten Brørup
@ 2022-11-04 11:17   ` Morten Brørup
  2022-11-04 11:17     ` [PATCH v3 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
                       ` (2 more replies)
  2 siblings, 3 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 11:17 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev, Morten Brørup

Split stats from debug, to make mempool statistics available without the
performance cost of continuously validating the debug cookies in the
mempool elements.

mempool_perf_autotest shows the following improvements in rate_persec.

The cost of enabling mempool debug without this patch:
-28.1 % and -74.0 %, respectively without and with cache.

The cost of enabling mempool stats (without debug) after this patch:
-5.8 % and -21.2 %, respectively without and with cache.

v3:
* Update the Programmer's Guide.
* Update the description of the RTE_MEMPOOL_STAT_ADD macro.
v2:
* Fix checkpatch warning:
  Use C style comments in rte_include.h, not C++ style.
* Do not rename the rte_mempool_debug_stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 config/rte_config.h                   |  2 ++
 doc/guides/prog_guide/mempool_lib.rst |  6 +++++-
 lib/mempool/rte_mempool.c             |  6 +++---
 lib/mempool/rte_mempool.h             | 10 +++++-----
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index ae56a86394..3c4876d434 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -47,6 +47,8 @@
 
 /* mempool defines */
 #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+/* RTE_LIBRTE_MEMPOOL_STATS is not set */
+/* RTE_LIBRTE_MEMPOOL_DEBUG is not set */
 
 /* mbuf defines */
 #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index 55838317b9..4f4ee33463 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -20,12 +20,16 @@ Cookies
 In debug mode, cookies are added at the beginning and end of allocated blocks.
 The allocated objects then contain overwrite protection fields to help debugging buffer overflows.
 
+Debug mode is disabled by default, but can be enabled by setting ``RTE_LIBRTE_MEMPOOL_DEBUG`` in ``config/rte_config.h``.
+
 Stats
 -----
 
-In debug mode, statistics about get from/put in the pool are stored in the mempool structure.
+In stats mode, statistics about get from/put in the pool are stored in the mempool structure.
 Statistics are per-lcore to avoid concurrent access to statistics counters.
 
+Stats mode is disabled by default, but can be enabled by setting ``RTE_LIBRTE_MEMPOOL_STATS`` in ``config/rte_config.h``.
+
 Memory Alignment Constraints on x86 architecture
 ------------------------------------------------
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..62d1ce764e 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -818,7 +818,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
@@ -1221,7 +1221,7 @@ rte_mempool_audit(struct rte_mempool *mp)
 void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	struct rte_mempool_info info;
 	struct rte_mempool_debug_stats sum;
 	unsigned lcore_id;
@@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "  common_pool_count=%u\n", common_count);
 
 	/* sum and dump statistics */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..2afe332097 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -56,7 +56,7 @@ extern "C" {
 #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header cookie. */
 #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer cookie.*/
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 /**
  * A structure that stores the mempool statistics (per-lcore).
  * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are not
@@ -237,7 +237,7 @@ struct rte_mempool {
 	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	/** Per-lcore statistics. */
 	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
 #endif
@@ -293,16 +293,16 @@ struct rte_mempool {
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats is enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
 		unsigned __lcore_id = rte_lcore_id();           \
 		if (__lcore_id < RTE_MAX_LCORE) {               \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v3 2/3] mempool: add stats for unregistered non-EAL threads
  2022-11-04 11:17   ` [PATCH v3 1/3] mempool: split stats from debug Morten Brørup
@ 2022-11-04 11:17     ` Morten Brørup
  2022-11-04 11:17     ` [PATCH v3 3/3] mempool: use cache for frequently updated stats Morten Brørup
  2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
  2 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 11:17 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev, Morten Brørup

This patch adds statistics for unregistered non-EAL threads, which were
previously not included in the statistics.

Add one more entry to the stats array, and use the last index for
unregistered non-EAL threads.

The unregistered non-EAL thread statistics are incremented atomically.

In theory, the EAL thread counters should also be accessed atomically to
avoid tearing on 32 bit architectures. However, it was decided to avoid
the performance cost of using atomic operations, because:
1. these are debug counters, and
2. statistics counters in DPDK are usually incremented non-atomically.

v3 (feedback from Mattias Rönnblom):
* Use correct terminology: Unregistered non-EAL threads.
* Use atomic counting for the unregistered non-EAL threads.
* Reintroduce the conditional instead of offsetting the index by one.
v2:
* New. No v1 of this patch in the series.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  2 +-
 lib/mempool/rte_mempool.h | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 62d1ce764e..e6208125e0 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
 		sum.put_objs += mp->stats[lcore_id].put_objs;
 		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 2afe332097..de6fceac5e 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -238,8 +238,11 @@ struct rte_mempool {
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	/** Per-lcore statistics.
+	 *
+	 * Plus one, for unregistered non-EAL threads.
+	 */
+	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -303,11 +306,13 @@ struct rte_mempool {
  *   Number to add to the statistics.
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-		unsigned __lcore_id = rte_lcore_id();           \
-		if (__lcore_id < RTE_MAX_LCORE) {               \
-			mp->stats[__lcore_id].name += n;        \
-		}                                               \
+#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
+		unsigned int __lcore_id = rte_lcore_id();                       \
+		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
+			(mp)->stats[__lcore_id].name += n;                      \
+		else                                                            \
+			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
+					   n, __ATOMIC_RELAXED);                \
 	} while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v3 3/3] mempool: use cache for frequently updated stats
  2022-11-04 11:17   ` [PATCH v3 1/3] mempool: split stats from debug Morten Brørup
  2022-11-04 11:17     ` [PATCH v3 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
@ 2022-11-04 11:17     ` Morten Brørup
  2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
  2 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 11:17 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev, Morten Brørup

When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
performance of mempools with caches is improved as follows.

When accessing objects in the mempool, either the put_bulk and put_objs or
the get_success_bulk and get_success_objs statistics counters are likely
to be incremented.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated statistics structure is avoided in the
likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
Updating some statistics counters in the same cache line has lower
performance cost than accessing the statistics counters in the dedicated
statistics structure, which resides in another cache line.

mempool_perf_autotest with this patch shows the following improvements in
rate_persec.

The cost of enabling mempool stats (without debug) after this patch:
-6.8 % and -6.7 %, respectively without and with cache.

v3:
* Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
  This change belongs in the first patch of the series.
v2:
* Move the statistics counters into a stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  9 ++++++
 lib/mempool/rte_mempool.h | 67 ++++++++++++++++++++++++++++++++-------
 2 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index e6208125e0..a18e39af04 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
+	if (mp->cache_size != 0) {
+		/* Add the statistics stored in the mempool caches. */
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
+		}
+	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
 	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index de6fceac5e..b7ba2542dd 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,19 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	uint32_t unused;
+	/*
+	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
+	 * providing faster update access when using a mempool cache.
+	 */
+	struct {
+		uint64_t put_bulk;          /**< Number of puts. */
+		uint64_t put_objs;          /**< Number of objects successfully put. */
+		uint64_t get_success_bulk;  /**< Successful allocation number. */
+		uint64_t get_success_objs;  /**< Objects successfully allocated. */
+	} stats;                        /**< Statistics */
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -318,6 +331,24 @@ struct rte_mempool {
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
 #endif
 
+/**
+ * @internal When stats is enabled, store some statistics.
+ *
+ * @param cache
+ *   Pointer to the memory pool cache.
+ * @param name
+ *   Name of the statistics field to increment in the memory pool cache.
+ * @param n
+ *   Number to add to the statistics.
+ */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do { \
+		(cache)->stats.name += n;               \
+	} while (0)
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif
+
 /**
  * @internal Calculate the size of the mempool header.
  *
@@ -1332,13 +1363,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request itself is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1363,6 +1398,11 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1469,8 +1509,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 		return 0;
 	}
@@ -1499,8 +1539,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
@@ -1522,8 +1562,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		if (likely(cache != NULL)) {
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		} else {
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		}
 	}
 
 	return ret;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v4 1/3] mempool: split stats from debug
  2022-11-04 11:17   ` [PATCH v3 1/3] mempool: split stats from debug Morten Brørup
  2022-11-04 11:17     ` [PATCH v3 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
  2022-11-04 11:17     ` [PATCH v3 3/3] mempool: use cache for frequently updated stats Morten Brørup
@ 2022-11-04 12:03     ` Morten Brørup
  2022-11-04 12:03       ` [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
                         ` (3 more replies)
  2 siblings, 4 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 12:03 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev, Morten Brørup

Split stats from debug, to make mempool statistics available without the
performance cost of continuously validating the debug cookies in the
mempool elements.

mempool_perf_autotest shows the following improvements in rate_persec.

The cost of enabling mempool debug without this patch:
-28.1 % and -74.0 %, respectively without and with cache.

The cost of enabling mempool stats (without debug) after this patch:
-5.8 % and -21.2 %, respectively without and with cache.
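
As a side note for anyone trying this out: with the config comment added below, one
way to enable the new stats mode (shown here only as an illustration, not as part of
the patch) is to define the macro in config/rte_config.h:

/* mempool defines */
#define RTE_MEMPOOL_CACHE_MAX_SIZE 512
#define RTE_LIBRTE_MEMPOOL_STATS 1	/* enable stats without the debug cookies */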

v4:
* No changes.
v3:
* Update the Programmer's Guide.
* Update the description of the RTE_MEMPOOL_STAT_ADD macro.
v2:
* Fix checkpatch warning:
  Use C style comments in rte_include.h, not C++ style.
* Do not rename the rte_mempool_debug_stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 config/rte_config.h                   |  2 ++
 doc/guides/prog_guide/mempool_lib.rst |  6 +++++-
 lib/mempool/rte_mempool.c             |  6 +++---
 lib/mempool/rte_mempool.h             | 11 ++++++-----
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index ae56a86394..3c4876d434 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -47,6 +47,8 @@
 
 /* mempool defines */
 #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+/* RTE_LIBRTE_MEMPOOL_STATS is not set */
+/* RTE_LIBRTE_MEMPOOL_DEBUG is not set */
 
 /* mbuf defines */
 #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index 55838317b9..4f4ee33463 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -20,12 +20,16 @@ Cookies
 In debug mode, cookies are added at the beginning and end of allocated blocks.
 The allocated objects then contain overwrite protection fields to help debugging buffer overflows.
 
+Debug mode is disabled by default, but can be enabled by setting ``RTE_LIBRTE_MEMPOOL_DEBUG`` in ``config/rte_config.h``.
+
 Stats
 -----
 
-In debug mode, statistics about get from/put in the pool are stored in the mempool structure.
+In stats mode, statistics about get from/put in the pool are stored in the mempool structure.
 Statistics are per-lcore to avoid concurrent access to statistics counters.
 
+Stats mode is disabled by default, but can be enabled by setting ``RTE_LIBRTE_MEMPOOL_STATS`` in ``config/rte_config.h``.
+
 Memory Alignment Constraints on x86 architecture
 ------------------------------------------------
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..62d1ce764e 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -818,7 +818,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
@@ -1221,7 +1221,7 @@ rte_mempool_audit(struct rte_mempool *mp)
 void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	struct rte_mempool_info info;
 	struct rte_mempool_debug_stats sum;
 	unsigned lcore_id;
@@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "  common_pool_count=%u\n", common_count);
 
 	/* sum and dump statistics */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..d5f7ea99fa 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -56,7 +56,7 @@ extern "C" {
 #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header cookie. */
 #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer cookie.*/
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 /**
  * A structure that stores the mempool statistics (per-lcore).
  * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are not
@@ -237,7 +237,7 @@ struct rte_mempool {
 	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	/** Per-lcore statistics. */
 	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
 #endif
@@ -292,17 +292,18 @@ struct rte_mempool {
 	| RTE_MEMPOOL_F_SC_GET \
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
+
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats is enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
 		unsigned __lcore_id = rte_lcore_id();           \
 		if (__lcore_id < RTE_MAX_LCORE) {               \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads
  2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
@ 2022-11-04 12:03       ` Morten Brørup
  2022-11-06 11:34         ` Andrew Rybchenko
  2022-11-04 12:03       ` [PATCH v4 3/3] mempool: use cache for frequently updated stats Morten Brørup
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 12:03 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev, Morten Brørup

This patch adds statistics for unregistered non-EAL threads, which were
previously not included in the statistics.

Add one more entry to the stats array, and use the last index for
unregistered non-EAL threads.

The unregistered non-EAL thread statistics are incremented atomically.

In theory, the EAL thread counters should also be accessed atomically to
avoid tearing on 32 bit architectures. However, it was decided to avoid
the performance cost of using atomic operations, because:
1. these are debug counters, and
2. statistics counters in DPDK are usually incremented non-atomically.
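
As a rough sketch of the indexing scheme described above (simplified stats structure
and counter; the real code is in the diff below):

#include <rte_branch_prediction.h>	/* likely() */
#include <rte_lcore.h>			/* rte_lcore_id(), RTE_MAX_LCORE */

struct stats { uint64_t put_objs; };

/* One slot per lcore, plus one trailing slot shared by all
 * unregistered non-EAL threads (rte_lcore_id() == LCORE_ID_ANY). */
static struct stats stats[RTE_MAX_LCORE + 1];

static inline void stat_add_put_objs(uint64_t n)
{
	unsigned int lcore_id = rte_lcore_id();

	if (likely(lcore_id < RTE_MAX_LCORE))
		stats[lcore_id].put_objs += n;		/* single writer, plain add */
	else
		__atomic_fetch_add(&stats[RTE_MAX_LCORE].put_objs,
				   n, __ATOMIC_RELAXED);	/* shared slot, atomic add */
}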

v4:
* No changes.
v3 (feedback from Mattias Rönnblom):
* Use correct terminology: Unregistered non-EAL threads.
* Use atomic counting for the unregistered non-EAL threads.
* Reintroduce the conditional instead of offsetting the index by one.
v2:
* New. No v1 of this patch in the series.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  2 +-
 lib/mempool/rte_mempool.h | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 62d1ce764e..e6208125e0 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
 		sum.put_objs += mp->stats[lcore_id].put_objs;
 		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index d5f7ea99fa..abfe34c05f 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -238,8 +238,11 @@ struct rte_mempool {
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	/** Per-lcore statistics.
+	 *
+	 * Plus one, for unregistered non-EAL threads.
+	 */
+	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -304,11 +307,13 @@ struct rte_mempool {
  *   Number to add to the statistics.
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-		unsigned __lcore_id = rte_lcore_id();           \
-		if (__lcore_id < RTE_MAX_LCORE) {               \
-			mp->stats[__lcore_id].name += n;        \
-		}                                               \
+#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
+		unsigned int __lcore_id = rte_lcore_id();                       \
+		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
+			(mp)->stats[__lcore_id].name += n;                      \
+		else                                                            \
+			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
+					   n, __ATOMIC_RELAXED);                \
 	} while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
  2022-11-04 12:03       ` [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
@ 2022-11-04 12:03       ` Morten Brørup
  2022-11-06 11:40         ` Andrew Rybchenko
                           ` (2 more replies)
  2022-11-06 11:32       ` [PATCH v4 1/3] mempool: split stats from debug Andrew Rybchenko
  2022-11-09 18:18       ` [PATCH v5 " Morten Brørup
  3 siblings, 3 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-04 12:03 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev, Morten Brørup

When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
performance of mempools with caches is improved as follows.

When accessing objects in the mempool, either the put_bulk and put_objs or
the get_success_bulk and get_success_objs statistics counters are likely
to be incremented.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated statistics structure is avoided in the
likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
Updating some statistics counters in the same cache line has lower
performance cost than accessing the statistics counters in the dedicated
statistics structure, which resides in another cache line.
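
Sketched with illustrative byte offsets (assuming a 64-byte cache line; the real
structure is in the diff below), the layout idea looks like this:

struct rte_mempool_cache {		/* __rte_cache_aligned */
	uint32_t size;			/* offset  0 */
	uint32_t flushthresh;		/* offset  4 */
	uint32_t len;			/* offset  8 */
#ifdef RTE_LIBRTE_MEMPOOL_STATS
	uint32_t unused;		/* offset 12 */
	struct {
		uint64_t put_bulk;		/* offset 16 */
		uint64_t put_objs;		/* offset 24 */
		uint64_t get_success_bulk;	/* offset 32 */
		uint64_t get_success_objs;	/* offset 40 */
	} stats;	/* ends at offset 48: same cache line as len/flushthresh */
#endif
	void *objs[];	/* cache objects follow (fixed-size array in the real structure) */
};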

mempool_perf_autotest with this patch shows the following improvements in
rate_persec.

The cost of enabling mempool stats (without debug) after this patch:
-6.8 % and -6.7 %, respectively without and with cache.

v4:
* Fix checkpatch warnings:
  A couple of typos in the patch description.
  The macro to add to a mempool cache stat variable should not use
  do {} while (0). Personally, I would tend to disagree with this, but
  whatever keeps the CI happy.
v3:
* Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
  This change belongs in the first patch of the series.
v2:
* Move the statistics counters into a stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  9 ++++++
 lib/mempool/rte_mempool.h | 66 ++++++++++++++++++++++++++++++++-------
 2 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index e6208125e0..a18e39af04 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
+	if (mp->cache_size != 0) {
+		/* Add the statistics stored in the mempool caches. */
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
+		}
+	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
 	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index abfe34c05f..e6eb573739 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -86,6 +86,19 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	uint32_t unused;
+	/*
+	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
+	 * providing faster update access when using a mempool cache.
+	 */
+	struct {
+		uint64_t put_bulk;          /**< Number of puts. */
+		uint64_t put_objs;          /**< Number of objects successfully put. */
+		uint64_t get_success_bulk;  /**< Successful allocation number. */
+		uint64_t get_success_objs;  /**< Objects successfully allocated. */
+	} stats;                        /**< Statistics */
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -319,6 +332,22 @@ struct rte_mempool {
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
 #endif
 
+/**
+ * @internal When stats is enabled, store some statistics.
+ *
+ * @param cache
+ *   Pointer to the memory pool cache.
+ * @param name
+ *   Name of the statistics field to increment in the memory pool cache.
+ * @param n
+ *   Number to add to the statistics.
+ */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)->stats.name += n
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif
+
 /**
  * @internal Calculate the size of the mempool header.
  *
@@ -1333,13 +1362,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request itself is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1364,6 +1397,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1470,8 +1509,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 		return 0;
 	}
@@ -1500,8 +1539,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
@@ -1523,8 +1562,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		if (likely(cache != NULL)) {
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		} else {
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		}
 	}
 
 	return ret;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v4 1/3] mempool: split stats from debug
  2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
  2022-11-04 12:03       ` [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
  2022-11-04 12:03       ` [PATCH v4 3/3] mempool: use cache for frequently updated stats Morten Brørup
@ 2022-11-06 11:32       ` Andrew Rybchenko
  2022-11-09 18:18       ` [PATCH v5 " Morten Brørup
  3 siblings, 0 replies; 39+ messages in thread
From: Andrew Rybchenko @ 2022-11-06 11:32 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev

On 11/4/22 15:03, Morten Brørup wrote:
> Split stats from debug, to make mempool statistics available without the
> performance cost of continuously validating the debug cookies in the
> mempool elements.
> 
> mempool_perf_autotest shows the following improvements in rate_persec.
> 
> The cost of enabling mempool debug without this patch:
> -28.1 % and -74.0 %, respectively without and with cache.
> 
> The cost of enabling mempool stats (without debug) after this patch:
> -5.8 % and -21.2 %, respectively without and with cache.
> 
> v4:
> * No changes.
> v3:
> * Update the Programmer's Guide.
> * Update the description of the RTE_MEMPOOL_STAT_ADD macro.
> v2:
> * Fix checkpatch warning:
>    Use C style comments in rte_include.h, not C++ style.
> * Do not rename the rte_mempool_debug_stats structure.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>

Good idea, many thanks for the patch.

Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads
  2022-11-04 12:03       ` [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
@ 2022-11-06 11:34         ` Andrew Rybchenko
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Rybchenko @ 2022-11-06 11:34 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev

On 11/4/22 15:03, Morten Brørup wrote:
> This patch adds statistics for unregistered non-EAL threads, which were
> previously not included in the statistics.
> 
> Add one more entry to the stats array, and use the last index for
> unregistered non-EAL threads.
> 
> The unregistered non-EAL thread statistics are incremented atomically.
> 
> In theory, the EAL thread counters should also be accessed atomically to
> avoid tearing on 32 bit architectures. However, it was decided to avoid
> the performance cost of using atomic operations, because:
> 1. these are debug counters, and
> 2. statistics counters in DPDK are usually incremented non-atomically.
> 
> v4:
> * No changes.
> v3 (feedback from Mattias Rönnblom):
> * Use correct terminology: Unregistered non-EAL threads.
> * Use atomic counting for the unregistered non-EAL threads.
> * Reintroduce the conditional instead of offsetting the index by one.
> v2:
> * New. No v1 of this patch in the series.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>

Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-04 12:03       ` [PATCH v4 3/3] mempool: use cache for frequently updated stats Morten Brørup
@ 2022-11-06 11:40         ` Andrew Rybchenko
  2022-11-06 11:50           ` Morten Brørup
  2022-11-07  7:30         ` Mattias Rönnblom
  2022-11-08  9:20         ` Konstantin Ananyev
  2 siblings, 1 reply; 39+ messages in thread
From: Andrew Rybchenko @ 2022-11-06 11:40 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev

On 11/4/22 15:03, Morten Brørup wrote:
> When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> performance of mempools with caches is improved as follows.
> 
> When accessing objects in the mempool, either the put_bulk and put_objs or
> the get_success_bulk and get_success_objs statistics counters are likely
> to be incremented.
> 
> By adding an alternative set of these counters to the mempool cache
> structure, accessing the dedicated statistics structure is avoided in the
> likely cases where these counters are incremented.
> 
> The trick here is that the cache line holding the mempool cache structure
> is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
> Updating some statistics counters in the same cache line has lower
> performance cost than accessing the statistics counters in the dedicated
> statistics structure, which resides in another cache line.
> 
> mempool_perf_autotest with this patch shows the following improvements in
> rate_persec.
> 
> The cost of enabling mempool stats (without debug) after this patch:
> -6.8 % and -6.7 %, respectively without and with cache.
> 
> v4:
> * Fix checkpatch warnings:
>    A couple of typos in the patch description.
>    The macro to add to a mempool cache stat variable should not use
>    do {} while (0). Personally, I would tend to disagree with this, but
>    whatever keeps the CI happy.
> v3:
> * Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
>    This change belongs in the first patch of the series.
> v2:
> * Move the statistics counters into a stats structure.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>

Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>

[snip]

> +/**
> + * @internal When stats is enabled, store some statistics.
> + *
> + * @param cache
> + *   Pointer to the memory pool cache.
> + * @param name
> + *   Name of the statistics field to increment in the memory pool cache.
> + * @param n
> + *   Number to add to the statistics.
> + */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)->stats.name += n

I'd enclose it in parenthesis.

> +#else
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
> +#endif
> +



^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-06 11:40         ` Andrew Rybchenko
@ 2022-11-06 11:50           ` Morten Brørup
  2022-11-06 11:59             ` Andrew Rybchenko
  0 siblings, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-11-06 11:50 UTC (permalink / raw)
  To: Andrew Rybchenko, olivier.matz, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev

> From: Andrew Rybchenko [mailto:andrew.rybchenko@oktetlabs.ru]
> Sent: Sunday, 6 November 2022 12.41
> 
> On 11/4/22 15:03, Morten Brørup wrote:

[...]

> > +/**
> > + * @internal When stats is enabled, store some statistics.
> > + *
> > + * @param cache
> > + *   Pointer to the memory pool cache.
> > + * @param name
> > + *   Name of the statistics field to increment in the memory pool
> cache.
> > + * @param n
> > + *   Number to add to the statistics.
> > + */
> > +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> > +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)-
> >stats.name += n
> 
> I'd enclose it in parenthesis.

Me too! I had it surrounded by "do {...} while (0)" in v3, but checkpatch complained about it [1], so I changed it to the above. Which checkpatch also complains about. :-(

[1]: http://mails.dpdk.org/archives/test-report/2022-November/321316.html

Feel free to modify this macro at your preference when merging!

> 
> > +#else
> > +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
> > +#endif
> > +


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-06 11:50           ` Morten Brørup
@ 2022-11-06 11:59             ` Andrew Rybchenko
  2022-11-06 12:16               ` Morten Brørup
  0 siblings, 1 reply; 39+ messages in thread
From: Andrew Rybchenko @ 2022-11-06 11:59 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev

On 11/6/22 14:50, Morten Brørup wrote:
>> From: Andrew Rybchenko [mailto:andrew.rybchenko@oktetlabs.ru]
>> Sent: Sunday, 6 November 2022 12.41
>>
>> On 11/4/22 15:03, Morten Brørup wrote:
> 
> [...]
> 
>>> +/**
>>> + * @internal When stats is enabled, store some statistics.
>>> + *
>>> + * @param cache
>>> + *   Pointer to the memory pool cache.
>>> + * @param name
>>> + *   Name of the statistics field to increment in the memory pool
>> cache.
>>> + * @param n
>>> + *   Number to add to the statistics.
>>> + */
>>> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
>>> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)-
>>> stats.name += n
>>
>> I'd enclose it in parenthesis.
> 
> Me too! I had it surrounded by "do {...} while (0)" in v3, but checkpatch complained about it [1], so I changed it to the above. Which checkpatch also complains about. :-(

I mean
#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) \
  ((cache)->stats.name += (n))

> 
> [1]: http://mails.dpdk.org/archives/test-report/2022-November/321316.html

Yes, I've seen it.

> 
> Feel free to modify this macro at your preference when merging!
> 
>>
>>> +#else
>>> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
>>> +#endif
>>> +
> 


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-06 11:59             ` Andrew Rybchenko
@ 2022-11-06 12:16               ` Morten Brørup
  0 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-06 12:16 UTC (permalink / raw)
  To: Andrew Rybchenko, olivier.matz, mattias.ronnblom, stephen,
	jerinj, bruce.richardson
  Cc: hofors, thomas, dev

> From: Andrew Rybchenko [mailto:andrew.rybchenko@oktetlabs.ru]
> Sent: Sunday, 6 November 2022 13.00
> 
> On 11/6/22 14:50, Morten Brørup wrote:
> >> From: Andrew Rybchenko [mailto:andrew.rybchenko@oktetlabs.ru]
> >> Sent: Sunday, 6 November 2022 12.41
> >>
> >> On 11/4/22 15:03, Morten Brørup wrote:
> >
> > [...]
> >
> >>> +/**
> >>> + * @internal When stats is enabled, store some statistics.
> >>> + *
> >>> + * @param cache
> >>> + *   Pointer to the memory pool cache.
> >>> + * @param name
> >>> + *   Name of the statistics field to increment in the memory pool
> >> cache.
> >>> + * @param n
> >>> + *   Number to add to the statistics.
> >>> + */
> >>> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)-
> >>> stats.name += n
> >>
> >> I'd enclose it in parenthesis.
> >
> > Me too! I had it surrounded by "do {...} while (0)" in v3, but
> checkpatch complained about it [1], so I changed it to the above. Which
> checkpatch also complains about. :-(
> 
> I mean
> #define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) \
>   ((cache)->stats.name += (n))

Thank you for elaborating, Andrew. I guess this is exactly what the checkpatch error message for v4 [2] says - I just couldn't understand what it wanted me to do.

[2]: http://mails.dpdk.org/archives/test-report/2022-November/321345.html

I don't always agree with checkpatch, but now Thomas has a third option for how to write this macro when merging. :-)
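
For reference, the three forms that have come up in this thread are roughly:

/* v3: wrapped in do/while (checkpatch complained) */
#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do { \
		(cache)->stats.name += n;               \
	} while (0)

/* v4: bare statement (checkpatch complained again) */
#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)->stats.name += n

/* Andrew's suggestion: fully parenthesized expression */
#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) \
	((cache)->stats.name += (n))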

> 
> >
> > [1]: http://mails.dpdk.org/archives/test-report/2022-
> November/321316.html
> 
> Yes, I've seen it.
> 
> >
> > Feel free to modify this macro at your preference when merging!
> >
> >>
> >>> +#else
> >>> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
> >>> +#endif
> >>> +


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-04 10:01               ` Morten Brørup
@ 2022-11-07  7:26                 ` Mattias Rönnblom
  2022-11-07  8:56                   ` Morten Brørup
  0 siblings, 1 reply; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-07  7:26 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

On 2022-11-04 11:01, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Friday, 4 November 2022 09.59
>>
>> On 2022-11-03 09:59, Morten Brørup wrote:
>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>> Sent: Wednesday, 2 November 2022 18.53
>>>>
>>>> On 2022-11-02 10:09, Morten Brørup wrote:
>>>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>>>> Sent: Wednesday, 2 November 2022 08.53
>>>>>>
>>>>>> On 2022-10-31 12:26, Morten Brørup wrote:
>>>>>>> Offset the stats array index by one, and count non-DPDK threads
>> at
>>>>>> index
>>>>>>> zero.
>>>>>>>
>>>>>>> This patch provides two benefits:
>>>>>>> * Non-DPDK threads are also included in the statistics.
>>>>>>> * A conditional in the fast path is removed. Static branch
>>>> prediction
>>>>>> was
>>>>>>>       correct, so the performance improvement is negligible.
>>>>>>>
>>>>>>> v2:
>>>>>>> * New. No v1 of this patch in the series.
>>>>>>>
>>>>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>>>>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>>>>>>> ---
>>>>>>>      lib/mempool/rte_mempool.c |  2 +-
>>>>>>>      lib/mempool/rte_mempool.h | 12 ++++++------
>>>>>>>      2 files changed, 7 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/lib/mempool/rte_mempool.c
>> b/lib/mempool/rte_mempool.c
>>>>>>> index 62d1ce764e..e6208125e0 100644
>>>>>>> --- a/lib/mempool/rte_mempool.c
>>>>>>> +++ b/lib/mempool/rte_mempool.c
>>>>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
>> rte_mempool
>>>>>> *mp)
>>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>>>      	rte_mempool_ops_get_info(mp, &info);
>>>>>>>      	memset(&sum, 0, sizeof(sum));
>>>>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>>>>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
>> lcore_id++) {
>>>>>>>      		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>>>>>>>      		sum.put_objs += mp->stats[lcore_id].put_objs;
>>>>>>>      		sum.put_common_pool_bulk += mp-
>>>>>>> stats[lcore_id].put_common_pool_bulk;
>>>>>>> diff --git a/lib/mempool/rte_mempool.h
>> b/lib/mempool/rte_mempool.h
>>>>>>> index 9c4bf5549f..16e7e62e3c 100644
>>>>>>> --- a/lib/mempool/rte_mempool.h
>>>>>>> +++ b/lib/mempool/rte_mempool.h
>>>>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
>>>>>>>      	struct rte_mempool_memhdr_list mem_list; /**< List of
>>>> memory
>>>>>> chunks */
>>>>>>>
>>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>>> -	/** Per-lcore statistics. */
>>>>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
>>>>>>> +	/** Per-lcore statistics.
>>>>>>> +	 *
>>>>>>> +	 * Offset by one, to include non-DPDK threads.
>>>>>>> +	 */
>>>>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>>>>>>>      #endif
>>>>>>>      }  __rte_cache_aligned;
>>>>>>>
>>>>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
>>>>>>>       */
>>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>>>      #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
>>>> \
>>>>>>> -		unsigned __lcore_id = rte_lcore_id();           \
>>>>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
>>>>>>> -			mp->stats[__lcore_id].name += n;        \
>>>>>>> -		}                                               \
>>>>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
>>>>>>
>>>>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
>>>> an
>>>>>> unregistered non-EAL thread? Might be worth a comment, or better a
>>>>>> rewrite with an explicit LCORE_ID_ANY comparison.
>>>>>
>>>>> The purpose of this patch is to avoid the comparison here.
>>>>>
>>>>> Yes, it relies on the wrap to zero, and these conditions:
>>>>> 1. LCORE_ID_ANY being UINT32_MAX, and
>>>>> 2. the return type of rte_lcore_id() being unsigned int, and
>>>>> 3. unsigned int being uint32_t.
>>>>>
>>>>> When I wrote this, I considered it safe to assume that LCORE_ID_ANY
>>>> will remain the unsigned equivalent of -1 using the return type of
>>>> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
>>>> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
>>>> accordingly to UINT64_MAX.
>>>>>
>>>>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
>>>> UINT32_MAX], but just [rte_lcore_id() + 1].
>>>>>
>>>>> I struggled writing an appropriate comment without making it
>>>> unacceptably long, but eventually gave up, and settled for the one-
>> line
>>>> comment in the structure only.
>>>>>
>>>>>>
>>>>>> You anyways need a conditional. An atomic add must be used in the
>>>>>> unregistered EAL thread case.
>>>>>
>>>>> Good point: The "+= n" must be atomic for non-isolated threads.
>>>>>
>>>>
>>>> If the various unregistered non-EAL threads are run on isolated
>> cores
>>>> or not does not matter.
>>>
>>> Agree: They all share the same index, and thus may race, regardless
>> which cores they are using.
>>>
>>> Rephrasing: The "+= n" must be atomic for the unregistered non-EAL
>> threads.
>>>
>>>>
>>>>> I just took a look at how software maintained stats are handled
>>>> elsewhere, and the first thing I found, is the IOAT DMA driver,
>> which
>>>> also seems to be using non-atomic increment [1] regardless if used
>> by a
>>>> DPDK thread or not.
>>>>>
>>>>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
>>>> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
>>>>>
>>>>> However, doing it wrong elsewhere doesn't make it correct doing it
>>>> wrong here too. :-)
>>>>>
>>>>> Atomic increments are costly, so I would rather follow your
>>>> suggestion and reintroduce the comparison. How about:
>>>>>
>>>>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>>>>>        unsigned int __lcore_id = rte_lcore_id(); \
>>>>>        if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>>>>>            (mp)->stats[__lcore_id].name += n; \
>>>>>        } else {
>>>>>            rte_atomic64_add( \
>>>>>                    (rte_atomic64_t*)&((mp)-
>>> stats[RTE_MAX_LCORE].name),
>>>> n);\
>>>>>        } \
>>>>> }
>>>> You are supposed to use GCC C11 intrinsics (e.g.,
>>>> __atomic_fetch_add()).
>>>
>>> Ups. I forgot!
>>>
>>> This should be emphasized everywhere in the rte_atomic library, to
>> prevent such mistakes.
>>>
>>>>
>>>> In addition: technically, you must use an atomic store for the EAL
>>>> thread case (and an atomic load on the reader side), although there
>> are
>>>> tons of examples in DPDK where tearing is ignored. (The generated
>> code
>>>> will likely look the same.)
>>>
>>> The counters are 64 bit aligned, but tearing could happen on 32 bit
>> architectures.
>>>
>>
>> The compiler is free to do it on any architecture, but I'm not sure if
>> it happens much in practice.
>>
>>> My initial reaction to your comment was to do it correctly on the EAL
>> threads too, to avoid tearing there too. However, there might be a
>> performance cost for 32 bit architectures, so I will consider that
>> these are only debug counters, and accept the risk of tearing.
>>>
>>
>> What would that cost consist of?
> 
> I have tested it this morning on Godbolt: https://godbolt.org/z/fz7f3cv8b
> 
> ctr += n becomes:
> 
>          add     QWORD PTR ctr[rip], rdi
> 
> Whereas __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
>   
>          lock add        QWORD PTR ctr[rip], rdi
> 

Since there is a single writer/producer, only the store need be atomic, 
not the complete load+add+store sequence.

__atomic_store_n(&ctr, ctr + 1, __ATOMIC_RELAXED);

Such a construct will not result in a lock add instruction.
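
As a stand-alone illustration of that single-writer pattern (hypothetical counter,
not the mempool code):

static uint64_t ctr;	/* written by exactly one thread */

/* Writer: the load and add may be plain; only the store is atomic. */
static inline void ctr_add(uint64_t n)
{
	__atomic_store_n(&ctr, ctr + n, __ATOMIC_RELAXED);
}

/* Reader (any thread): an atomic load avoids torn reads, e.g. on 32-bit targets. */
static inline uint64_t ctr_read(void)
{
	return __atomic_load_n(&ctr, __ATOMIC_RELAXED);
}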

I happen to have finished a draft version of a chapter on this topic, in 
the data plane book I'm writing:
https://ericsson.github.io/dataplanebook/stats/stats.html#per-core-counters

I will add a section with this "add an extra instance to deal with 
unregistered non-EAL threads" approach taken in your patch.

>>
>> In theory C11 atomics could be implemented "in software" (i.e., without
>> the proper ISA-level guarantees, with locks), but does any of the
>> DPDK-supported compiler/32-bit ISAs actually do so?
> 
> Adding -m32 to the compiler options, ctr += n becomes:
> 
>          xor     edx, edx
>          mov     eax, DWORD PTR [esp+4]
>          add     DWORD PTR ctr, eax
>          adc     DWORD PTR ctr+4, edx
> 
> And __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
> 
>          push    edi
>          xor     edi, edi
>          push    esi
>          push    ebx
>          sub     esp, 8
>          mov     esi, DWORD PTR [esp+24]
>          mov     eax, DWORD PTR ctr
>          mov     edx, DWORD PTR ctr+4
> .L4:
>          mov     ecx, eax
>          mov     ebx, edx
>          add     ecx, esi
>          adc     ebx, edi
>          mov     DWORD PTR [esp], ecx
>          mov     DWORD PTR [esp+4], ebx
>          mov     ebx, DWORD PTR [esp]
>          mov     ecx, DWORD PTR [esp+4]
>          lock cmpxchg8b  QWORD PTR ctr
>          jne     .L4
>          add     esp, 8
>          pop     ebx
>          pop     esi
>          pop     edi
> 
>>
>> It's also not obvious that if there, for a certain
>> compiler/ISA-combination, is a performance impact to pay for
>> correctness, correctness would have to give way.
> 
> In this particular case, and because they are only debug counters, I will argue that DPDK generally uses non-atomic access to 64 bit statistics counters. This was also the case for these counters before this patch.
> 
> I am planning to finish v3 of this patch series today, so I hope you can agree to close the atomicity discussion with another argument: Making statistics counters atomic should be elevated to a general patch across DPDK, and not part of this mempool patch series.
> 

As I think I indicated, I think this is a minor issue. I'm OK with the 
code as-written.

> The v3 patch (which I am working on right now) counts atomically for the unregistered non-EAL threads:
> 
> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
> 		unsigned int __lcore_id = rte_lcore_id();                       \
> 		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
> 			(mp)->stats[__lcore_id].name += n;                      \
> 		else                                                            \
> 			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
> 					   n, __ATOMIC_RELAXED);                \
> 	} while (0)
> 
> PS: Excellent discussion - thank you for getting involved, Mattias.
> 

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-04 12:03       ` [PATCH v4 3/3] mempool: use cache for frequently updated stats Morten Brørup
  2022-11-06 11:40         ` Andrew Rybchenko
@ 2022-11-07  7:30         ` Mattias Rönnblom
  2022-11-08  9:20         ` Konstantin Ananyev
  2 siblings, 0 replies; 39+ messages in thread
From: Mattias Rönnblom @ 2022-11-07  7:30 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko,
	mattias.ronnblom, stephen, jerinj, bruce.richardson
  Cc: thomas, dev

On 2022-11-04 13:03, Morten Brørup wrote:
> When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> performance of mempools with caches is improved as follows.
> 
> When accessing objects in the mempool, either the put_bulk and put_objs or
> the get_success_bulk and get_success_objs statistics counters are likely
> to be incremented.
> 
> By adding an alternative set of these counters to the mempool cache
> structure, accessing the dedicated statistics structure is avoided in the
> likely cases where these counters are incremented.
> 
> The trick here is that the cache line holding the mempool cache structure
> is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
> Updating some statistics counters in the same cache line has lower
> performance cost than accessing the statistics counters in the dedicated
> statistics structure, which resides in another cache line.
> 
> mempool_perf_autotest with this patch shows the following improvements in
> rate_persec.
> 
> The cost of enabling mempool stats (without debug) after this patch:
> -6.8 % and -6.7 %, respectively without and with cache.
> 
> v4:
> * Fix checkpatch warnings:
>    A couple of typos in the patch description.
>    The macro to add to a mempool cache stat variable should not use
>    do {} while (0). Personally, I would tend to disagree with this, but
>    whatever keeps the CI happy.
> v3:
> * Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
>    This change belongs in the first patch of the series.
> v2:
> * Move the statistics counters into a stats structure.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   lib/mempool/rte_mempool.c |  9 ++++++
>   lib/mempool/rte_mempool.h | 66 ++++++++++++++++++++++++++++++++-------
>   2 files changed, 64 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index e6208125e0..a18e39af04 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>   		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
>   		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
>   	}
> +	if (mp->cache_size != 0) {
> +		/* Add the statistics stored in the mempool caches. */
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
> +			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
> +			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
> +			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
> +		}
> +	}
>   	fprintf(f, "  stats:\n");
>   	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
>   	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index abfe34c05f..e6eb573739 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -86,6 +86,19 @@ struct rte_mempool_cache {
>   	uint32_t size;	      /**< Size of the cache */
>   	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>   	uint32_t len;	      /**< Current cache count */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +	uint32_t unused;
> +	/*
> +	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
> +	 * providing faster update access when using a mempool cache.
> +	 */
> +	struct {
> +		uint64_t put_bulk;          /**< Number of puts. */
> +		uint64_t put_objs;          /**< Number of objects successfully put. */
> +		uint64_t get_success_bulk;  /**< Successful allocation number. */
> +		uint64_t get_success_objs;  /**< Objects successfully allocated. */
> +	} stats;                        /**< Statistics */
> +#endif
>   	/**
>   	 * Cache objects
>   	 *
> @@ -319,6 +332,22 @@ struct rte_mempool {
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
>   #endif
>   
> +/**
> + * @internal When stats is enabled, store some statistics.
> + *
> + * @param cache
> + *   Pointer to the memory pool cache.
> + * @param name
> + *   Name of the statistics field to increment in the memory pool cache.
> + * @param n
> + *   Number to add to the statistics.
> + */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)->stats.name += n
> +#else
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
> +#endif
> +
>   /**
>    * @internal Calculate the size of the mempool header.
>    *
> @@ -1333,13 +1362,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>   {
>   	void **cache_objs;
>   
> +	/* No cache provided */
> +	if (unlikely(cache == NULL))
> +		goto driver_enqueue;
> +
>   	/* increment stat now, adding in mempool always success */
> -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>   
> -	/* No cache provided or the request itself is too big for the cache */
> -	if (unlikely(cache == NULL || n > cache->flushthresh))
> -		goto driver_enqueue;
> +	/* The request itself is too big for the cache */
> +	if (unlikely(n > cache->flushthresh))
> +		goto driver_enqueue_stats_incremented;
>   
>   	/*
>   	 * The cache follows the following algorithm:
> @@ -1364,6 +1397,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>   
>   driver_enqueue:
>   
> +	/* increment stat now, adding in mempool always success */
> +	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> +	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +
> +driver_enqueue_stats_incremented:
> +
>   	/* push objects to the backend */
>   	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
>   }
> @@ -1470,8 +1509,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   	if (remaining == 0) {
>   		/* The entire request is satisfied from the cache. */
>   
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
>   		return 0;
>   	}
> @@ -1500,8 +1539,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   
>   	cache->len = cache->size;
>   
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>   
>   	return 0;
>   
> @@ -1523,8 +1562,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>   		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
>   		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>   	} else {
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		if (likely(cache != NULL)) {
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +		} else {
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		}
>   	}
>   
>   	return ret;

For the series,
Reviewed-By: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v2 2/3] mempool: include non-DPDK threads in statistics
  2022-11-07  7:26                 ` Mattias Rönnblom
@ 2022-11-07  8:56                   ` Morten Brørup
  0 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-07  8:56 UTC (permalink / raw)
  To: Mattias Rönnblom, olivier.matz, andrew.rybchenko, stephen,
	jerinj, bruce.richardson
  Cc: thomas, dev

> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Monday, 7 November 2022 08.27
> 
> On 2022-11-04 11:01, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >> Sent: Friday, 4 November 2022 09.59
> >>
> >> On 2022-11-03 09:59, Morten Brørup wrote:
> >>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >>>> Sent: Wednesday, 2 November 2022 18.53
> >>>>
> >>>> On 2022-11-02 10:09, Morten Brørup wrote:
> >>>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >>>>>> Sent: Wednesday, 2 November 2022 08.53
> >>>>>>
> >>>>>> On 2022-10-31 12:26, Morten Brørup wrote:
> >>>>>>> Offset the stats array index by one, and count non-DPDK threads
> >> at
> >>>>>> index
> >>>>>>> zero.
> >>>>>>>
> >>>>>>> This patch provides two benefits:
> >>>>>>> * Non-DPDK threads are also included in the statistics.
> >>>>>>> * A conditional in the fast path is removed. Static branch
> >>>> prediction
> >>>>>> was
> >>>>>>>       correct, so the performance improvement is negligible.
> >>>>>>>
> >>>>>>> v2:
> >>>>>>> * New. No v1 of this patch in the series.
> >>>>>>>
> >>>>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> >>>>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> >>>>>>> ---
> >>>>>>>      lib/mempool/rte_mempool.c |  2 +-
> >>>>>>>      lib/mempool/rte_mempool.h | 12 ++++++------
> >>>>>>>      2 files changed, 7 insertions(+), 7 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/lib/mempool/rte_mempool.c
> >> b/lib/mempool/rte_mempool.c
> >>>>>>> index 62d1ce764e..e6208125e0 100644
> >>>>>>> --- a/lib/mempool/rte_mempool.c
> >>>>>>> +++ b/lib/mempool/rte_mempool.c
> >>>>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
> >> rte_mempool
> >>>>>> *mp)
> >>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>>>      	rte_mempool_ops_get_info(mp, &info);
> >>>>>>>      	memset(&sum, 0, sizeof(sum));
> >>>>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> >>>>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
> >> lcore_id++) {
> >>>>>>>      		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >>>>>>>      		sum.put_objs += mp->stats[lcore_id].put_objs;
> >>>>>>>      		sum.put_common_pool_bulk += mp-
> >>>>>>> stats[lcore_id].put_common_pool_bulk;
> >>>>>>> diff --git a/lib/mempool/rte_mempool.h
> >> b/lib/mempool/rte_mempool.h
> >>>>>>> index 9c4bf5549f..16e7e62e3c 100644
> >>>>>>> --- a/lib/mempool/rte_mempool.h
> >>>>>>> +++ b/lib/mempool/rte_mempool.h
> >>>>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
> >>>>>>>      	struct rte_mempool_memhdr_list mem_list; /**< List of
> >>>> memory
> >>>>>> chunks */
> >>>>>>>
> >>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>>> -	/** Per-lcore statistics. */
> >>>>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> >>>>>>> +	/** Per-lcore statistics.
> >>>>>>> +	 *
> >>>>>>> +	 * Offset by one, to include non-DPDK threads.
> >>>>>>> +	 */
> >>>>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >>>>>>>      #endif
> >>>>>>>      }  __rte_cache_aligned;
> >>>>>>>
> >>>>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
> >>>>>>>       */
> >>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>>>      #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> >>>> \
> >>>>>>> -		unsigned __lcore_id = rte_lcore_id();           \
> >>>>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> >>>>>>> -			mp->stats[__lcore_id].name += n;        \
> >>>>>>> -		}                                               \
> >>>>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> >>>>>>
> >>>>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0,
> for
> >>>> an
> >>>>>> unregistered non-EAL thread? Might be worth a comment, or better
> a
> >>>>>> rewrite with an explicit LCORE_ID_ANY comparison.
> >>>>>
> >>>>> The purpose of this patch is to avoid the comparison here.
> >>>>>
> >>>>> Yes, it relies on the wrap to zero, and these conditions:
> >>>>> 1. LCORE_ID_ANY being UINT32_MAX, and
> >>>>> 2. the return type of rte_lcore_id() being unsigned int, and
> >>>>> 3. unsigned int being uint32_t.
> >>>>>
> >>>>> When I wrote this, I considered it safe to assume that
> LCORE_ID_ANY
> >>>> will remain the unsigned equivalent of -1 using the return type of
> >>>> rte_lcore_id(). In other words: If the return type of
> rte_lcore_id()
> >>>> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> >>>> accordingly to UINT64_MAX.
> >>>>>
> >>>>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> >>>> UINT32_MAX], but just [rte_lcore_id() + 1].
> >>>>>
> >>>>> I struggled writing an appropriate comment without making it
> >>>> unacceptably long, but eventually gave up, and settled for the
> one-
> >> line
> >>>> comment in the structure only.
> >>>>>
> >>>>>>
> >>>>>> You anyways need a conditional. An atomic add must be used in
> the
> >>>>>> unregistered EAL thread case.
> >>>>>
> >>>>> Good point: The "+= n" must be atomic for non-isolated threads.
> >>>>>
> >>>>
> >>>> If the various unregistered non-EAL threads are run on isolated
> >> cores
> >>>> or not does not matter.
> >>>
> >>> Agree: They all share the same index, and thus may race, regardless
> >> which cores they are using.
> >>>
> >>> Rephrasing: The "+= n" must be atomic for the unregistered non-EAL
> >> threads.
> >>>
> >>>>
> >>>>> I just took a look at how software maintained stats are handled
> >>>> elsewhere, and the first thing I found, is the IOAT DMA driver,
> >> which
> >>>> also seems to be using non-atomic increment [1] regardless if used
> >> by a
> >>>> DPDK thread or not.
> >>>>>
> >>>>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
> >>>> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >>>>>
> >>>>> However, doing it wrong elsewhere doesn't make it correct doing
> it
> >>>> wrong here too. :-)
> >>>>>
> >>>>> Atomic increments are costly, so I would rather follow your
> >>>> suggestion and reintroduce the comparison. How about:
> >>>>>
> >>>>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >>>>>        unsigned int __lcore_id = rte_lcore_id(); \
> >>>>>        if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >>>>>            (mp)->stats[__lcore_id].name += n; \
> >>>>>        } else {
> >>>>>            rte_atomic64_add( \
> >>>>>                    (rte_atomic64_t*)&((mp)-
> >>> stats[RTE_MAX_LCORE].name),
> >>>> n);\
> >>>>>        } \
> >>>>> }
> >>>> You are supposed to use GCC C11 intrinsics (e.g.,
> >>>> __atomic_fetch_add()).
> >>>
> >>> Ups. I forgot!
> >>>
> >>> This should be emphasized everywhere in the rte_atomic library, to
> >> prevent such mistakes.
> >>>
> >>>>
> >>>> In addition: technically, you must use an atomic store for the EAL
> >>>> thread case (and an atomic load on the reader side), although
> there
> >> are
> >>>> tons of examples in DPDK where tearing is ignored. (The generated
> >> code
> >>>> will likely look the same.)
> >>>
> >>> The counters are 64 bit aligned, but tearing could happen on 32 bit
> >> architectures.
> >>>
> >>
> >> The compiler is free to do it on any architecture, but I'm not sure
> if
> >> it happens much in practice.
> >>
> >>> My initial reaction to your comment was to do it correctly on the
> EAL
> >> threads too, to avoid tearing there too. However, there might be a
> >> performance cost for 32 bit architectures, so I will consider that
> >> these are only debug counters, and accept the risk of tearing.
> >>>
> >>
> >> What would that cost consist of?
> >
> > I have tested it this morning on Godbolt:
> https://godbolt.org/z/fz7f3cv8b
> >
> > ctr += n becomes:
> >
> >          add     QWORD PTR ctr[rip], rdi
> >
> > Whereas __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
> >
> >          lock add        QWORD PTR ctr[rip], rdi
> >
> 
> Since there is a single writer/producer, only the store need be atomic,
> not the complete load+add+store sequence.
> 
> __atomic_store_n(&ctr, ctr + 1, __ATOMIC_RELAXED);
> 
> Such a construct will not result in a lock add instruction.

I was trying to get rid of the conditional by handling both kinds of threads the same way. However, without the conditional, the __atomic_fetch_add needed for the unregistered non-EAL threads would spill over to the EAL threads as well. So I concluded that the conditional should remain.
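
For clarity, the two macro variants side by side (both taken from earlier messages
in this thread):

/* v2: no conditional and no atomics; the index is offset by one, so an
 * unregistered non-EAL thread (rte_lcore_id() == LCORE_ID_ANY) wraps to slot 0. */
#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                          \
		(mp)->stats[rte_lcore_id() + 1].name += n;              \
	} while (0)

/* v3/v4: conditional kept; plain add for EAL threads, relaxed atomic add
 * only for the shared slot used by unregistered non-EAL threads. */
#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
		unsigned int __lcore_id = rte_lcore_id();                       \
		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
			(mp)->stats[__lcore_id].name += n;                      \
		else                                                            \
			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
					   n, __ATOMIC_RELAXED);                \
	} while (0)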

> 
> I happen to have just finished a draft version of a chapter on this
> topic, in the data plane book I'm writing:
> https://ericsson.github.io/dataplanebook/stats/stats.html#per-core-counters

Thanks for the link. I just took a look at it, and it looks great!

> 
> I will add a section with this "add an extra instance to deal with
> unregistered non-EAL threads" approach taken in your patch.
> 
> >>
> >> In theory, C11 atomics could be implemented "in software" (i.e., without
> >> the proper ISA-level guarantees, with locks), but do any of the
> >> DPDK-supported compiler/32-bit ISA combinations actually do so?
> >
> > Adding -m32 to the compiler options, ctr += n becomes:
> >
> >          xor     edx, edx
> >          mov     eax, DWORD PTR [esp+4]
> >          add     DWORD PTR ctr, eax
> >          adc     DWORD PTR ctr+4, edx
> >
> > And __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
> >
> >          push    edi
> >          xor     edi, edi
> >          push    esi
> >          push    ebx
> >          sub     esp, 8
> >          mov     esi, DWORD PTR [esp+24]
> >          mov     eax, DWORD PTR ctr
> >          mov     edx, DWORD PTR ctr+4
> > .L4:
> >          mov     ecx, eax
> >          mov     ebx, edx
> >          add     ecx, esi
> >          adc     ebx, edi
> >          mov     DWORD PTR [esp], ecx
> >          mov     DWORD PTR [esp+4], ebx
> >          mov     ebx, DWORD PTR [esp]
> >          mov     ecx, DWORD PTR [esp+4]
> >          lock cmpxchg8b  QWORD PTR ctr
> >          jne     .L4
> >          add     esp, 8
> >          pop     ebx
> >          pop     esi
> >          pop     edi
> >
> >>
> >> It's also not obvious that, if there is a performance impact to pay for
> >> correctness on a certain compiler/ISA combination, correctness would
> >> have to give way.
> >
> > In this particular case, and because they are only debug counters, I
> > will argue that DPDK generally uses non-atomic access to 64-bit
> > statistics counters. This was also the case for these counters before
> > this patch.
> >
> > I am planning to finish v3 of this patch series today, so I hope you
> > can agree to close the atomicity discussion with another argument:
> > making statistics counters atomic should be elevated to a general patch
> > across DPDK, and not be part of this mempool patch series.
> >
> 
> As I think I indicated, I think this is a minor issue. I'm OK with the
> code as-written.

Thanks. We can always micro-optimize the unregistered non-EAL thread counters later. :-)
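
As a side note, the Godbolt comparison quoted above can be reproduced with a minimal standalone snippet like the following (assuming GCC, compiled with -O2, and additionally -m32 for the 32-bit case):

#include <stdint.h>

uint64_t ctr;

void add_plain(uint64_t n)
{
	ctr += n;	/* plain add; the store may tear on 32-bit targets */
}

void add_atomic(uint64_t n)
{
	/* lock add on x86-64; a cmpxchg8b loop with -m32 */
	__atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED);
}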

> 
> > The v3 patch (which I am working on right now) counts atomically for
> > the unregistered non-EAL threads:
> >
> > #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
> > 		unsigned int __lcore_id = rte_lcore_id();                       \
> > 		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
> > 			(mp)->stats[__lcore_id].name += n;                      \
> > 		else                                                            \
> > 			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
> > 					   n, __ATOMIC_RELAXED);                \
> > 	} while (0)
> >
> > PS: Excellent discussion - thank you for getting involved, Mattias.
> >


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-04 12:03       ` [PATCH v4 3/3] mempool: use cache for frequently updated stats Morten Brørup
  2022-11-06 11:40         ` Andrew Rybchenko
  2022-11-07  7:30         ` Mattias Rönnblom
@ 2022-11-08  9:20         ` Konstantin Ananyev
  2022-11-08 11:21           ` Morten Brørup
  2 siblings, 1 reply; 39+ messages in thread
From: Konstantin Ananyev @ 2022-11-08  9:20 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, andrew.rybchenko,
	mattias.ronnblom, stephen, jerinj, bruce.richardson
  Cc: hofors, thomas, dev


> When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> performance of mempools with caches is improved as follows.
> 
> When accessing objects in the mempool, either the put_bulk and put_objs or
> the get_success_bulk and get_success_objs statistics counters are likely
> to be incremented.
> 
> By adding an alternative set of these counters to the mempool cache
> structure, accessing the dedicated statistics structure is avoided in the
> likely cases where these counters are incremented.
> 
> The trick here is that the cache line holding the mempool cache structure
> is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
> Updating some statistics counters in the same cache line has lower
> performance cost than accessing the statistics counters in the dedicated
> statistics structure, which resides in another cache line.
> 
> mempool_perf_autotest with this patch shows the following improvements in
> rate_persec.
> 
> The cost of enabling mempool stats (without debug) after this patch:
> -6.8 % and -6.7 %, respectively without and with cache.
> 
> v4:
> * Fix checkpatch warnings:
>   A couple of typos in the patch description.
>   The macro to add to a mempool cache stat variable should not use
>   do {} while (0). Personally, I would tend to disagree with this, but
>   whatever keeps the CI happy.
> v3:
> * Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
>   This change belongs in the first patch of the series.
> v2:
> * Move the statistics counters into a stats structure.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  lib/mempool/rte_mempool.c |  9 ++++++
>  lib/mempool/rte_mempool.h | 66 ++++++++++++++++++++++++++++++++-------
>  2 files changed, 64 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index e6208125e0..a18e39af04 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -1286,6 +1286,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>  		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
>  		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
>  	}
> +	if (mp->cache_size != 0) {
> +		/* Add the statistics stored in the mempool caches. */
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
> +			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
> +			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
> +			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
> +		}
> +	}
>  	fprintf(f, "  stats:\n");
>  	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
>  	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index abfe34c05f..e6eb573739 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -86,6 +86,19 @@ struct rte_mempool_cache {
>  	uint32_t size;	      /**< Size of the cache */
>  	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>  	uint32_t len;	      /**< Current cache count */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +	uint32_t unused;
> +	/*
> +	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
> +	 * providing faster update access when using a mempool cache.
> +	 */
> +	struct {
> +		uint64_t put_bulk;          /**< Number of puts. */
> +		uint64_t put_objs;          /**< Number of objects successfully put. */
> +		uint64_t get_success_bulk;  /**< Successful allocation number. */
> +		uint64_t get_success_objs;  /**< Objects successfully allocated. */
> +	} stats;                        /**< Statistics */
> +#endif
>  	/**
>  	 * Cache objects
>  	 *
> @@ -319,6 +332,22 @@ struct rte_mempool {
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
>  #endif
> 
> +/**
> + * @internal When stats is enabled, store some statistics.
> + *
> + * @param cache
> + *   Pointer to the memory pool cache.
> + * @param name
> + *   Name of the statistics field to increment in the memory pool cache.
> + * @param n
> + *   Number to add to the statistics.
> + */
> +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)->stats.name += n

As Andrew already pointed out, it needs to be: ((cache)->stats.name += (n))
Apart from that, LGTM.
Series-Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

> +#else
> +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
> +#endif
> +
>  /**
>   * @internal Calculate the size of the mempool header.
>   *
> @@ -1333,13 +1362,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  {
>  	void **cache_objs;
> 
> +	/* No cache provided */
> +	if (unlikely(cache == NULL))
> +		goto driver_enqueue;
> +
>  	/* increment stat now, adding in mempool always success */
> -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> 
> -	/* No cache provided or the request itself is too big for the cache */
> -	if (unlikely(cache == NULL || n > cache->flushthresh))
> -		goto driver_enqueue;
> +	/* The request itself is too big for the cache */
> +	if (unlikely(n > cache->flushthresh))
> +		goto driver_enqueue_stats_incremented;
> 
>  	/*
>  	 * The cache follows the following algorithm:
> @@ -1364,6 +1397,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> 
>  driver_enqueue:
> 
> +	/* increment stat now, adding in mempool always success */
> +	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> +	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> +
> +driver_enqueue_stats_incremented:
> +
>  	/* push objects to the backend */
>  	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
>  }
> @@ -1470,8 +1509,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  	if (remaining == 0) {
>  		/* The entire request is satisfied from the cache. */
> 
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> 
>  		return 0;
>  	}
> @@ -1500,8 +1539,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> 
>  	cache->len = cache->size;
> 
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> 
>  	return 0;
> 
> @@ -1523,8 +1562,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>  		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
>  		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
>  	} else {
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> -		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		if (likely(cache != NULL)) {
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> +			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> +		} else {
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> +			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> +		}
>  	}
> 
>  	return ret;
> --
> 2.17.1
> 


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH v4 3/3] mempool: use cache for frequently updated stats
  2022-11-08  9:20         ` Konstantin Ananyev
@ 2022-11-08 11:21           ` Morten Brørup
  0 siblings, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-08 11:21 UTC (permalink / raw)
  To: thomas
  Cc: hofors, dev, Konstantin Ananyev, olivier.matz, andrew.rybchenko,
	mattias.ronnblom, stephen, jerinj, bruce.richardson

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Tuesday, 8 November 2022 10.20
> 
> > When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> > performance of mempools with caches is improved as follows.
> >
> > When accessing objects in the mempool, either the put_bulk and put_objs
> > or the get_success_bulk and get_success_objs statistics counters are
> > likely to be incremented.
> >
> > By adding an alternative set of these counters to the mempool cache
> > structure, accessing the dedicated statistics structure is avoided in
> > the likely cases where these counters are incremented.
> >
> > The trick here is that the cache line holding the mempool cache
> > structure is accessed anyway, in order to access the 'len' or
> > 'flushthresh' fields. Updating some statistics counters in the same
> > cache line has lower performance cost than accessing the statistics
> > counters in the dedicated statistics structure, which resides in
> > another cache line.
> >
> > mempool_perf_autotest with this patch shows the following improvements
> > in rate_persec.
> >
> > The cost of enabling mempool stats (without debug) after this patch:
> > -6.8 % and -6.7 %, respectively without and with cache.
> >
> > v4:
> > * Fix checkpatch warnings:
> >   A couple of typos in the patch description.
> >   The macro to add to a mempool cache stat variable should not use
> >   do {} while (0). Personally, I would tend to disagree with this, but
> >   whatever keeps the CI happy.
> > v3:
> > * Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
> >   This change belongs in the first patch of the series.
> > v2:
> > * Move the statistics counters into a stats structure.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---

[...]

> > +/**
> > + * @internal When stats is enabled, store some statistics.
> > + *
> > + * @param cache
> > + *   Pointer to the memory pool cache.
> > + * @param name
> > + *   Name of the statistics field to increment in the memory pool cache.
> > + * @param n
> > + *   Number to add to the statistics.
> > + */
> > +#ifdef RTE_LIBRTE_MEMPOOL_STATS
> > +#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) (cache)->stats.name += n
> 
> As Andrew already pointed out, it needs to be: ((cache)->stats.name += (n))
> Apart from that, LGTM.
> Series-Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

@Thomas, this series should be ready to apply... it now has been:
Reviewed-by: (mempool maintainer) Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

Please fix the RTE_MEMPOOL_CACHE_STAT_ADD macro while merging, to satisfy checkpatch. ;-)

It should be:

+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) ((cache)->stats.name += (n))
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v5 1/3] mempool: split stats from debug
  2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
                         ` (2 preceding siblings ...)
  2022-11-06 11:32       ` [PATCH v4 1/3] mempool: split stats from debug Andrew Rybchenko
@ 2022-11-09 18:18       ` Morten Brørup
  2022-11-09 18:18         ` [PATCH v5 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
  2022-11-09 18:18         ` [PATCH v5 3/3] mempool: use cache for frequently updated stats Morten Brørup
  3 siblings, 2 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-09 18:18 UTC (permalink / raw)
  To: thomas, olivier.matz, andrew.rybchenko, mattias.ronnblom,
	stephen, jerinj, bruce.richardson, konstantin.ananyev
  Cc: hofors, david.marchand, dev, Morten Brørup

Split stats from debug, to make mempool statistics available without the
performance cost of continuously validating the debug cookies in the
mempool elements.

mempool_perf_autotest shows the following improvements in rate_persec.

The cost of enabling mempool debug without this patch:
-28.1 % and -74.0 %, respectively without and with cache.

The cost of enabling mempool stats (without debug) after this patch:
-5.8 % and -21.2 %, respectively without and with cache.

v5:
* No changes.
v4:
* No changes.
v3:
* Update the Programmer's Guide.
* Update the description of the RTE_MEMPOOL_STAT_ADD macro.
v2:
* Fix checkpatch warning:
  Use C style comments in rte_include.h, not C++ style.
* Do not rename the rte_mempool_debug_stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 config/rte_config.h                   |  2 ++
 doc/guides/prog_guide/mempool_lib.rst |  6 +++++-
 lib/mempool/rte_mempool.c             |  6 +++---
 lib/mempool/rte_mempool.h             | 11 ++++++-----
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index ae56a86394..3c4876d434 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -47,6 +47,8 @@
 
 /* mempool defines */
 #define RTE_MEMPOOL_CACHE_MAX_SIZE 512
+/* RTE_LIBRTE_MEMPOOL_STATS is not set */
+/* RTE_LIBRTE_MEMPOOL_DEBUG is not set */
 
 /* mbuf defines */
 #define RTE_MBUF_DEFAULT_MEMPOOL_OPS "ring_mp_mc"
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index 55838317b9..4f4ee33463 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -20,12 +20,16 @@ Cookies
 In debug mode, cookies are added at the beginning and end of allocated blocks.
 The allocated objects then contain overwrite protection fields to help debugging buffer overflows.
 
+Debug mode is disabled by default, but can be enabled by setting ``RTE_LIBRTE_MEMPOOL_DEBUG`` in ``config/rte_config.h``.
+
 Stats
 -----
 
-In debug mode, statistics about get from/put in the pool are stored in the mempool structure.
+In stats mode, statistics about get from/put in the pool are stored in the mempool structure.
 Statistics are per-lcore to avoid concurrent access to statistics counters.
 
+Stats mode is disabled by default, but can be enabled by setting ``RTE_LIBRTE_MEMPOOL_STATS`` in ``config/rte_config.h``.
+
 Memory Alignment Constraints on x86 architecture
 ------------------------------------------------
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 21c94a2b9f..62d1ce764e 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -818,7 +818,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
 			  RTE_CACHE_LINE_MASK) != 0);
 	RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
@@ -1221,7 +1221,7 @@ rte_mempool_audit(struct rte_mempool *mp)
 void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	struct rte_mempool_info info;
 	struct rte_mempool_debug_stats sum;
 	unsigned lcore_id;
@@ -1269,7 +1269,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "  common_pool_count=%u\n", common_count);
 
 	/* sum and dump statistics */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 3725a72951..d5f7ea99fa 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -56,7 +56,7 @@ extern "C" {
 #define RTE_MEMPOOL_HEADER_COOKIE2  0xf2eef2eedadd2e55ULL /**< Header cookie. */
 #define RTE_MEMPOOL_TRAILER_COOKIE  0xadd2e55badbadbadULL /**< Trailer cookie.*/
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 /**
  * A structure that stores the mempool statistics (per-lcore).
  * Note: Cache stats (put_cache_bulk/objs, get_cache_bulk/objs) are not
@@ -237,7 +237,7 @@ struct rte_mempool {
 	uint32_t nb_mem_chunks;          /**< Number of memory chunks */
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 	/** Per-lcore statistics. */
 	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
 #endif
@@ -292,17 +292,18 @@ struct rte_mempool {
 	| RTE_MEMPOOL_F_SC_GET \
 	| RTE_MEMPOOL_F_NO_IOVA_CONTIG \
 	)
+
 /**
- * @internal When debug is enabled, store some statistics.
+ * @internal When stats is enabled, store some statistics.
  *
  * @param mp
  *   Pointer to the memory pool.
  * @param name
  *   Name of the statistics field to increment in the memory pool.
  * @param n
- *   Number to add to the object-oriented statistics.
+ *   Number to add to the statistics.
  */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
 		unsigned __lcore_id = rte_lcore_id();           \
 		if (__lcore_id < RTE_MAX_LCORE) {               \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v5 2/3] mempool: add stats for unregistered non-EAL threads
  2022-11-09 18:18       ` [PATCH v5 " Morten Brørup
@ 2022-11-09 18:18         ` Morten Brørup
  2022-11-09 18:18         ` [PATCH v5 3/3] mempool: use cache for frequently updated stats Morten Brørup
  1 sibling, 0 replies; 39+ messages in thread
From: Morten Brørup @ 2022-11-09 18:18 UTC (permalink / raw)
  To: thomas, olivier.matz, andrew.rybchenko, mattias.ronnblom,
	stephen, jerinj, bruce.richardson, konstantin.ananyev
  Cc: hofors, david.marchand, dev, Morten Brørup

This patch adds statistics for unregistered non-EAL threads, which were
previously not included in the statistics.

Add one more entry to the stats array, and use the last index for
unregistered non-EAL threads.

The unregistered non-EAL thread statistics are incremented atomically.

In theory, the EAL thread counters should also be accessed atomically to
avoid tearing on 32 bit architectures. However, it was decided to avoid
the performance cost of using atomic operations, because:
1. these are debug counters, and
2. statistics counters in DPDK are usually incremented non-atomically.

v5:
* Add parentheses around n in the RTE_MEMPOOL_STAT_ADD macro.
v4:
* No changes.
v3 (feedback from Mattias Rönnblom):
* Use correct terminology: Unregistered non-EAL threads.
* Use atomic counting for the unregistered non-EAL threads.
* Reintroduce the conditional instead of offsetting the index by one.
v2:
* New. No v1 of this patch in the series.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/mempool/rte_mempool.c |  2 +-
 lib/mempool/rte_mempool.h | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 62d1ce764e..e6208125e0 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
 		sum.put_objs += mp->stats[lcore_id].put_objs;
 		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index d5f7ea99fa..d8ccb06e2e 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -238,8 +238,11 @@ struct rte_mempool {
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	/** Per-lcore statistics.
+	 *
+	 * Plus one, for unregistered non-EAL threads.
+	 */
+	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -304,11 +307,13 @@ struct rte_mempool {
  *   Number to add to the statistics.
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-		unsigned __lcore_id = rte_lcore_id();           \
-		if (__lcore_id < RTE_MAX_LCORE) {               \
-			mp->stats[__lcore_id].name += n;        \
-		}                                               \
+#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
+		unsigned int __lcore_id = rte_lcore_id();                       \
+		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
+			(mp)->stats[__lcore_id].name += (n);                    \
+		else                                                            \
+			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
+					   (n), __ATOMIC_RELAXED);              \
 	} while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH v5 3/3] mempool: use cache for frequently updated stats
  2022-11-09 18:18       ` [PATCH v5 " Morten Brørup
  2022-11-09 18:18         ` [PATCH v5 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
@ 2022-11-09 18:18         ` Morten Brørup
  2022-11-10 16:36           ` Thomas Monjalon
  1 sibling, 1 reply; 39+ messages in thread
From: Morten Brørup @ 2022-11-09 18:18 UTC (permalink / raw)
  To: thomas, olivier.matz, andrew.rybchenko, mattias.ronnblom,
	stephen, jerinj, bruce.richardson, konstantin.ananyev
  Cc: hofors, david.marchand, dev, Morten Brørup

When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
performance of mempools with caches is improved as follows.

When accessing objects in the mempool, either the put_bulk and put_objs or
the get_success_bulk and get_success_objs statistics counters are likely
to be incremented.

By adding an alternative set of these counters to the mempool cache
structure, accessing the dedicated statistics structure is avoided in the
likely cases where these counters are incremented.

The trick here is that the cache line holding the mempool cache structure
is accessed anyway, in order to access the 'len' or 'flushthresh' fields.
Updating some statistics counters in the same cache line has lower
performance cost than accessing the statistics counters in the dedicated
statistics structure, which resides in another cache line.

mempool_perf_autotest with this patch shows the following improvements in
rate_persec.

The cost of enabling mempool stats (without debug) after this patch:
-6.8 % and -6.7 %, respectively without and with cache.

v5:
* Add SmartShare Systems to the list of copyright holders.
* Add parentheses around n in the RTE_MEMPOOL_CACHE_STAT_ADD macro.
* Fix checkpatch warning:
  The macro to add to a mempool cache stat variable should be enclosed in
  parentheses.
v4:
* Fix checkpatch warnings:
  A couple of typos in the patch description.
  The macro to add to a mempool cache stat variable should not use
  do {} while (0). Personally, I would tend to disagree with this, but
  whatever keeps the CI happy.
v3:
* Don't update the description of the RTE_MEMPOOL_STAT_ADD macro.
  This change belongs in the first patch of the series.
v2:
* Move the statistics counters into a stats structure.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/mempool/rte_mempool.c | 10 ++++++
 lib/mempool/rte_mempool.h | 67 ++++++++++++++++++++++++++++++++-------
 2 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index e6208125e0..f33f455790 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation.
  * Copyright(c) 2016 6WIND S.A.
+ * Copyright(c) 2022 SmartShare Systems
  */
 
 #include <stdbool.h>
@@ -1286,6 +1287,15 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
 		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
+	if (mp->cache_size != 0) {
+		/* Add the statistics stored in the mempool caches. */
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+			sum.put_bulk += mp->local_cache[lcore_id].stats.put_bulk;
+			sum.put_objs += mp->local_cache[lcore_id].stats.put_objs;
+			sum.get_success_bulk += mp->local_cache[lcore_id].stats.get_success_bulk;
+			sum.get_success_objs += mp->local_cache[lcore_id].stats.get_success_objs;
+		}
+	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
 	fprintf(f, "    put_objs=%"PRIu64"\n", sum.put_objs);
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index d8ccb06e2e..631f205d53 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation.
  * Copyright(c) 2016 6WIND S.A.
+ * Copyright(c) 2022 SmartShare Systems
  */
 
 #ifndef _RTE_MEMPOOL_H_
@@ -86,6 +87,19 @@ struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+	uint32_t unused;
+	/*
+	 * Alternative location for the most frequently updated mempool statistics (per-lcore),
+	 * providing faster update access when using a mempool cache.
+	 */
+	struct {
+		uint64_t put_bulk;          /**< Number of puts. */
+		uint64_t put_objs;          /**< Number of objects successfully put. */
+		uint64_t get_success_bulk;  /**< Successful allocation number. */
+		uint64_t get_success_objs;  /**< Objects successfully allocated. */
+	} stats;                        /**< Statistics */
+#endif
 	/**
 	 * Cache objects
 	 *
@@ -319,6 +333,22 @@ struct rte_mempool {
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
 #endif
 
+/**
+ * @internal When stats is enabled, store some statistics.
+ *
+ * @param cache
+ *   Pointer to the memory pool cache.
+ * @param name
+ *   Name of the statistics field to increment in the memory pool cache.
+ * @param n
+ *   Number to add to the statistics.
+ */
+#ifdef RTE_LIBRTE_MEMPOOL_STATS
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) ((cache)->stats.name += (n))
+#else
+#define RTE_MEMPOOL_CACHE_STAT_ADD(cache, name, n) do {} while (0)
+#endif
+
 /**
  * @internal Calculate the size of the mempool header.
  *
@@ -1333,13 +1363,17 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 {
 	void **cache_objs;
 
+	/* No cache provided */
+	if (unlikely(cache == NULL))
+		goto driver_enqueue;
+
 	/* increment stat now, adding in mempool always success */
-	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
 
-	/* No cache provided or the request itself is too big for the cache */
-	if (unlikely(cache == NULL || n > cache->flushthresh))
-		goto driver_enqueue;
+	/* The request itself is too big for the cache */
+	if (unlikely(n > cache->flushthresh))
+		goto driver_enqueue_stats_incremented;
 
 	/*
 	 * The cache follows the following algorithm:
@@ -1364,6 +1398,12 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 driver_enqueue:
 
+	/* increment stat now, adding in mempool always success */
+	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
+	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
+
+driver_enqueue_stats_incremented:
+
 	/* push objects to the backend */
 	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
 }
@@ -1470,8 +1510,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 	if (remaining == 0) {
 		/* The entire request is satisfied from the cache. */
 
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 		return 0;
 	}
@@ -1500,8 +1540,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 
 	cache->len = cache->size;
 
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-	RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
 
 	return 0;
 
@@ -1523,8 +1563,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
 		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
 	} else {
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
-		RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		if (likely(cache != NULL)) {
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
+			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
+		} else {
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
+			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
+		}
 	}
 
 	return ret;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v5 3/3] mempool: use cache for frequently updated stats
  2022-11-09 18:18         ` [PATCH v5 3/3] mempool: use cache for frequently updated stats Morten Brørup
@ 2022-11-10 16:36           ` Thomas Monjalon
  0 siblings, 0 replies; 39+ messages in thread
From: Thomas Monjalon @ 2022-11-10 16:36 UTC (permalink / raw)
  To: Morten Brørup
  Cc: olivier.matz, andrew.rybchenko, mattias.ronnblom, stephen,
	jerinj, bruce.richardson, konstantin.ananyev, dev, hofors,
	david.marchand, dev

09/11/2022 19:18, Morten Brørup:
> When built with stats enabled (RTE_LIBRTE_MEMPOOL_STATS defined), the
> performance of mempools with caches is improved as follows.

Series applied, thanks.





^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2022-11-10 16:36 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-30 11:54 [PATCH] mempool: split statistics from debug Morten Brørup
2022-10-30 14:04 ` Morten Brørup
2022-10-30 16:12   ` Stephen Hemminger
2022-10-30 20:29     ` Morten Brørup
2022-10-31 11:26 ` [PATCH v2 1/3] " Morten Brørup
2022-10-31 11:26   ` [PATCH v2 2/3] mempool: include non-DPDK threads in statistics Morten Brørup
2022-11-02  7:52     ` Mattias Rönnblom
2022-11-02  9:09       ` Morten Brørup
2022-11-02 15:19         ` Stephen Hemminger
2022-11-02 15:37           ` Morten Brørup
2022-11-02 17:53         ` Mattias Rönnblom
2022-11-03  8:59           ` Morten Brørup
2022-11-04  8:58             ` Mattias Rönnblom
2022-11-04 10:01               ` Morten Brørup
2022-11-07  7:26                 ` Mattias Rönnblom
2022-11-07  8:56                   ` Morten Brørup
2022-10-31 11:26   ` [PATCH v2 3/3] mempool: use cache for frequently updated statistics Morten Brørup
2022-11-02  8:01     ` Mattias Rönnblom
2022-11-02  9:29       ` Morten Brørup
2022-11-02 17:55         ` Mattias Rönnblom
2022-11-04 11:17   ` [PATCH v3 1/3] mempool: split stats from debug Morten Brørup
2022-11-04 11:17     ` [PATCH v3 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
2022-11-04 11:17     ` [PATCH v3 3/3] mempool: use cache for frequently updated stats Morten Brørup
2022-11-04 12:03     ` [PATCH v4 1/3] mempool: split stats from debug Morten Brørup
2022-11-04 12:03       ` [PATCH v4 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
2022-11-06 11:34         ` Andrew Rybchenko
2022-11-04 12:03       ` [PATCH v4 3/3] mempool: use cache for frequently updated stats Morten Brørup
2022-11-06 11:40         ` Andrew Rybchenko
2022-11-06 11:50           ` Morten Brørup
2022-11-06 11:59             ` Andrew Rybchenko
2022-11-06 12:16               ` Morten Brørup
2022-11-07  7:30         ` Mattias Rönnblom
2022-11-08  9:20         ` Konstantin Ananyev
2022-11-08 11:21           ` Morten Brørup
2022-11-06 11:32       ` [PATCH v4 1/3] mempool: split stats from debug Andrew Rybchenko
2022-11-09 18:18       ` [PATCH v5 " Morten Brørup
2022-11-09 18:18         ` [PATCH v5 2/3] mempool: add stats for unregistered non-EAL threads Morten Brørup
2022-11-09 18:18         ` [PATCH v5 3/3] mempool: use cache for frequently updated stats Morten Brørup
2022-11-10 16:36           ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).