DPDK patches and discussions
* [PATCH] mempool: test performance with constant n
@ 2022-01-19 11:37 Morten Brørup
  2022-01-24 10:26 ` Olivier Matz
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Morten Brørup @ 2022-01-19 11:37 UTC (permalink / raw)
  To: olivier.matz, andrew.rybchenko
  Cc: thomas, bruce.richardson, jerinjacobk, dev, Morten Brørup

"What gets measured gets done."

This patch adds mempool performance tests where the number of objects to
put and get is constant at compile time, which may significantly improve
the performance of these functions. [*]
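
The mechanism behind the speed-up is inlining: the mempool get/put fast
path is inlined at the call site, so a compile-time constant count lets
the compiler unroll the internal object-copy loop and drop the loop
bookkeeping. A minimal sketch of the effect (a simplified stand-in for
the copy loop, with illustrative names, not the actual mempool
internals):

	/* Simplified stand-in for the inlined object-copy loop. */
	static inline void
	copy_objs(void **dst, void * const *src, unsigned int n)
	{
		unsigned int i;

		for (i = 0; i < n; i++)
			dst[i] = src[i];
	}

	/* n known only at run time: the counter and branch remain. */
	void get_runtime(void **dst, void * const *src, unsigned int n)
	{
		copy_objs(dst, src, n);
	}

	/* n a compile-time constant: typically unrolled into eight
	 * straight-line loads and stores, with no loop at all. */
	void get_constant(void **dst, void * const *src)
	{
		copy_objs(dst, src, 8);
	}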

Also, it is ensured that the array holding the objects used for testing
is cache line aligned, for maximum performance.

And finally, the following entries are added to the list of tests:
- Number of kept objects: 512
- Number of objects to get and to put: The number of pointers fitting
  into a cache line, i.e. 8 or 16
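
The "8 or 16" follows directly from the cache line size; a hypothetical
compile-time check, assuming a 64-bit target with 8-byte pointers:

	#include <assert.h>
	#include <stdint.h>

	/* 64-byte cache line (x86, most Arm cores): 64 / 8 = 8. */
	static_assert(64 / sizeof(uintptr_t) == 8, "8 pointers per line");
	/* 128-byte cache line (some Arm/POWER parts): 128 / 8 = 16. */
	static_assert(128 / sizeof(uintptr_t) == 16, "16 pointers per line");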

[*] Some example performance test (with cache) results:

get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462

get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643

get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836
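
(The constant-n gains above work out to roughly 2.2x, 1.9x and 1.3x,
shrinking as the bulk size grows and the per-call overhead is
amortized over more objects.)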

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 app/test/test_mempool_perf.c | 120 +++++++++++++++++++++--------------
 1 file changed, 74 insertions(+), 46 deletions(-)

diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index 87ad251367..ffefe934d5 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
  */
 
 #include <string.h>
@@ -55,19 +56,24 @@
  *
  *      - Bulk get from 1 to 32
  *      - Bulk put from 1 to 32
+ *      - Bulk get and put from 1 to 32, compile time constant
  *
  *    - Number of kept objects (*n_keep*)
  *
  *      - 32
  *      - 128
+ *      - 512
  */
 
 #define N 65536
 #define TIME_S 5
 #define MEMPOOL_ELT_SIZE 2048
-#define MAX_KEEP 128
+#define MAX_KEEP 512
 #define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
 
+/* Number of pointers fitting into one cache line. */
+#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE/sizeof(uintptr_t))
+
 #define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
 #define RET_ERR() do {							\
 		LOG_ERR();						\
@@ -80,16 +86,16 @@
 	} while (0)
 
 static int use_external_cache;
-static unsigned external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+static unsigned int external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
 
 static uint32_t synchro;
 
 /* number of objects in one bulk operation (get or put) */
-static unsigned n_get_bulk;
-static unsigned n_put_bulk;
+static int n_get_bulk;
+static int n_put_bulk;
 
 /* number of objects retrieved from mempool before putting them back */
-static unsigned n_keep;
+static int n_keep;
 
 /* number of enqueues / dequeues */
 struct mempool_test_stats {
@@ -104,20 +110,43 @@ static struct mempool_test_stats stats[RTE_MAX_LCORE];
  */
 static void
 my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
-	    void *obj, unsigned i)
+	    void *obj, unsigned int i)
 {
 	uint32_t *objnum = obj;
 	memset(obj, 0, mp->elt_size);
 	*objnum = i;
 }
 
+#define test_loop(x_keep, x_get_bulk, x_put_bulk)		       \
+		for (i = 0; likely(i < (N/x_keep)); i++) {	      \
+			/* get x_keep objects by bulk of x_get_bulk */  \
+			for (idx = 0; idx < x_keep; idx += x_get_bulk) {\
+				ret = rte_mempool_generic_get(mp,       \
+						&obj_table[idx],	\
+						x_get_bulk,	     \
+						cache);		 \
+				if (unlikely(ret < 0)) {		\
+					rte_mempool_dump(stdout, mp);   \
+					GOTO_ERR(ret, out);	     \
+				}				       \
+			}					       \
+									\
+			/* put the objects back by bulk of x_put_bulk */\
+			for (idx = 0; idx < x_keep; idx += x_put_bulk) {\
+				rte_mempool_generic_put(mp,	     \
+						&obj_table[idx],	\
+						x_put_bulk,	     \
+						cache);		 \
+			}					       \
+		}
+
 static int
 per_lcore_mempool_test(void *arg)
 {
-	void *obj_table[MAX_KEEP];
-	unsigned i, idx;
+	void *obj_table[MAX_KEEP] __rte_cache_aligned;
+	int i, idx;
 	struct rte_mempool *mp = arg;
-	unsigned lcore_id = rte_lcore_id();
+	unsigned int lcore_id = rte_lcore_id();
 	int ret = 0;
 	uint64_t start_cycles, end_cycles;
 	uint64_t time_diff = 0, hz = rte_get_timer_hz();
@@ -139,6 +168,9 @@ per_lcore_mempool_test(void *arg)
 		GOTO_ERR(ret, out);
 	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
 		GOTO_ERR(ret, out);
+	/* for constant n, n_get_bulk and n_put_bulk must be the same */
+	if (n_get_bulk < 0 && n_put_bulk != n_get_bulk)
+		GOTO_ERR(ret, out);
 
 	stats[lcore_id].enq_count = 0;
 
@@ -149,30 +181,18 @@ per_lcore_mempool_test(void *arg)
 	start_cycles = rte_get_timer_cycles();
 
 	while (time_diff/hz < TIME_S) {
-		for (i = 0; likely(i < (N/n_keep)); i++) {
-			/* get n_keep objects by bulk of n_bulk */
-			idx = 0;
-			while (idx < n_keep) {
-				ret = rte_mempool_generic_get(mp,
-							      &obj_table[idx],
-							      n_get_bulk,
-							      cache);
-				if (unlikely(ret < 0)) {
-					rte_mempool_dump(stdout, mp);
-					/* in this case, objects are lost... */
-					GOTO_ERR(ret, out);
-				}
-				idx += n_get_bulk;
-			}
-
-			/* put the objects back */
-			idx = 0;
-			while (idx < n_keep) {
-				rte_mempool_generic_put(mp, &obj_table[idx],
-							n_put_bulk,
-							cache);
-				idx += n_put_bulk;
-			}
+		if (n_get_bulk > 0) {
+			test_loop(n_keep, n_get_bulk, n_put_bulk);
+		} else if (n_get_bulk == -1) {
+			test_loop(-n_keep, 1, 1);
+		} else if (n_get_bulk == -4) {
+			test_loop(-n_keep, 4, 4);
+		} else if (n_get_bulk == -(int)CACHE_LINE_BURST) {
+			test_loop(-n_keep, CACHE_LINE_BURST, CACHE_LINE_BURST);
+		} else if (n_get_bulk == -32) {
+			test_loop(-n_keep, 32, 32);
+		} else {
+			GOTO_ERR(ret, out);
 		}
 		end_cycles = rte_get_timer_cycles();
 		time_diff = end_cycles - start_cycles;
@@ -192,10 +212,10 @@ per_lcore_mempool_test(void *arg)
 static int
 launch_cores(struct rte_mempool *mp, unsigned int cores)
 {
-	unsigned lcore_id;
+	unsigned int lcore_id;
 	uint64_t rate;
 	int ret;
-	unsigned cores_save = cores;
+	unsigned int cores_save = cores;
 
 	__atomic_store_n(&synchro, 0, __ATOMIC_RELAXED);
 
@@ -203,10 +223,14 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 	memset(stats, 0, sizeof(stats));
 
 	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
-	       "n_put_bulk=%u n_keep=%u ",
+	       "n_put_bulk=%u n_keep=%u constant_n=%s ",
 	       use_external_cache ?
-		   external_cache_size : (unsigned) mp->cache_size,
-	       cores, n_get_bulk, n_put_bulk, n_keep);
+		   external_cache_size : (unsigned int) mp->cache_size,
+	       cores,
+	       n_get_bulk > 0 ? n_get_bulk : -n_get_bulk,
+	       n_put_bulk > 0 ? n_put_bulk : -n_put_bulk,
+	       n_keep > 0 ? n_keep : -n_keep,
+	       n_get_bulk > 0 ? "false" : "true");
 
 	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
 		printf("mempool is not full\n");
@@ -253,12 +277,13 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 static int
 do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 {
-	unsigned bulk_tab_get[] = { 1, 4, 32, 0 };
-	unsigned bulk_tab_put[] = { 1, 4, 32, 0 };
-	unsigned keep_tab[] = { 32, 128, 0 };
-	unsigned *get_bulk_ptr;
-	unsigned *put_bulk_ptr;
-	unsigned *keep_ptr;
+	/* Negative n_get_bulk values represent constants in the test. */
+	int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, -1, -4, -(int)CACHE_LINE_BURST, -32, 0 };
+	int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
+	int keep_tab[] = { 32, 128, 512, 0 };
+	int *get_bulk_ptr;
+	int *put_bulk_ptr;
+	int *keep_ptr;
 	int ret;
 
 	for (get_bulk_ptr = bulk_tab_get; *get_bulk_ptr; get_bulk_ptr++) {
@@ -266,13 +291,16 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
 
 				n_get_bulk = *get_bulk_ptr;
-				n_put_bulk = *put_bulk_ptr;
-				n_keep = *keep_ptr;
+				n_put_bulk = n_get_bulk > 0 ? *put_bulk_ptr : n_get_bulk;
+				n_keep = n_get_bulk > 0 ? *keep_ptr : -*keep_ptr;
 				ret = launch_cores(mp, cores);
 
 				if (ret < 0)
 					return -1;
 			}
+			/* Skip the put loop when n is constant */
+			if (n_get_bulk < 0)
+				break;
 		}
 	}
 	return 0;
-- 
2.17.1



* Re: [PATCH] mempool: test performance with constant n
  2022-01-19 11:37 [PATCH] mempool: test performance with constant n Morten Brørup
@ 2022-01-24 10:26 ` Olivier Matz
  2022-01-24 10:37   ` Morten Brørup
  2022-01-24 14:53 ` Olivier Matz
  2022-01-24 14:59 ` [PATCH v2] " Olivier Matz
  2 siblings, 1 reply; 9+ messages in thread
From: Olivier Matz @ 2022-01-24 10:26 UTC (permalink / raw)
  To: Morten Brørup
  Cc: andrew.rybchenko, thomas, bruce.richardson, jerinjacobk, dev

Hi Morten,

Thank you for enhancing the mempool test. Please see some comments
below.

On Wed, Jan 19, 2022 at 12:37:32PM +0100, Morten Brørup wrote:
> "What gets measured gets done."
> 
> This patch adds mempool performance tests where the number of objects to
> put and get is constant at compile time, which may significantly improve
> the performance of these functions. [*]
> 
> Also, it is ensured that the array holding the objects used for testing
> is cache line aligned, for maximum performance.
> 
> And finally, the following entries are added to the list of tests:
> - Number of kept objects: 512
> - Number of objects to get and to put: The number of pointers fitting
>   into a cache line, i.e. 8 or 16
> 
> [*] Some example performance test (with cache) results:
> 
> get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
> get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462
> 
> get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
> get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643
> 
> get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
> get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  app/test/test_mempool_perf.c | 120 +++++++++++++++++++++--------------
>  1 file changed, 74 insertions(+), 46 deletions(-)
> 
> diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
> index 87ad251367..ffefe934d5 100644
> --- a/app/test/test_mempool_perf.c
> +++ b/app/test/test_mempool_perf.c
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2022 SmartShare Systems
>   */
>  
>  #include <string.h>
> @@ -55,19 +56,24 @@
>   *
>   *      - Bulk get from 1 to 32
>   *      - Bulk put from 1 to 32
> + *      - Bulk get and put from 1 to 32, compile time constant
>   *
>   *    - Number of kept objects (*n_keep*)
>   *
>   *      - 32
>   *      - 128
> + *      - 512
>   */
>  
>  #define N 65536
>  #define TIME_S 5
>  #define MEMPOOL_ELT_SIZE 2048
> -#define MAX_KEEP 128
> +#define MAX_KEEP 512
>  #define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
>  
> +/* Number of pointers fitting into one cache line. */
> +#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE/sizeof(uintptr_t))
> +

nit: I think it's better to follow the coding rules and add a space around the
'/', even if I can see the line right above does not follow this convention

>  #define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
>  #define RET_ERR() do {							\
>  		LOG_ERR();						\
> @@ -80,16 +86,16 @@
>  	} while (0)
>  
>  static int use_external_cache;
> -static unsigned external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
> +static unsigned int external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
>  
>  static uint32_t synchro;
>  
>  /* number of objects in one bulk operation (get or put) */
> -static unsigned n_get_bulk;
> -static unsigned n_put_bulk;
> +static int n_get_bulk;
> +static int n_put_bulk;
>  
>  /* number of objects retrieved from mempool before putting them back */
> -static unsigned n_keep;
> +static int n_keep;
>  
>  /* number of enqueues / dequeues */
>  struct mempool_test_stats {
> @@ -104,20 +110,43 @@ static struct mempool_test_stats stats[RTE_MAX_LCORE];
>   */
>  static void
>  my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
> -	    void *obj, unsigned i)
> +	    void *obj, unsigned int i)
>  {
>  	uint32_t *objnum = obj;
>  	memset(obj, 0, mp->elt_size);
>  	*objnum = i;
>  }
>  
> +#define test_loop(x_keep, x_get_bulk, x_put_bulk)		       \
> +		for (i = 0; likely(i < (N/x_keep)); i++) {	      \
> +			/* get x_keep objects by bulk of x_get_bulk */  \
> +			for (idx = 0; idx < x_keep; idx += x_get_bulk) {\
> +				ret = rte_mempool_generic_get(mp,       \
> +						&obj_table[idx],	\
> +						x_get_bulk,	     \
> +						cache);		 \
> +				if (unlikely(ret < 0)) {		\
> +					rte_mempool_dump(stdout, mp);   \
> +					GOTO_ERR(ret, out);	     \
> +				}				       \
> +			}					       \
> +									\
> +			/* put the objects back by bulk of x_put_bulk */\
> +			for (idx = 0; idx < x_keep; idx += x_put_bulk) {\
> +				rte_mempool_generic_put(mp,	     \
> +						&obj_table[idx],	\
> +						x_put_bulk,	     \
> +						cache);		 \
> +			}					       \
> +		}
> +

I think a static __rte_always_inline function would do the job and is
clearer. Something like this:

static __rte_always_inline int
test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
{
	void *obj_table[MAX_KEEP] __rte_cache_aligned;
	unsigned int idx;
	unsigned int i;
	int ret;

	for (i = 0; likely(i < (N / x_keep)); i++) {
		/* get x_keep objects by bulk of x_get_bulk */
		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
			ret = rte_mempool_generic_get(mp,
						      &obj_table[idx],
						      x_get_bulk,
						      cache);
			if (unlikely(ret < 0)) {
				rte_mempool_dump(stdout, mp);
				return ret;
			}
		}

		/* put the objects back by bulk of x_put_bulk */
		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
			rte_mempool_generic_put(mp,
						&obj_table[idx],
						x_put_bulk,
						cache);
		}
	}

	return 0;
}
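
Being __rte_always_inline, each call site that passes literal arguments
gets its own constant-folded instantiation, so this keeps the
specialization effect of the macro while adding type checking.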


>  static int
>  per_lcore_mempool_test(void *arg)
>  {
> -	void *obj_table[MAX_KEEP];
> -	unsigned i, idx;
> +	void *obj_table[MAX_KEEP] __rte_cache_aligned;
> +	int i, idx;
>  	struct rte_mempool *mp = arg;
> -	unsigned lcore_id = rte_lcore_id();
> +	unsigned int lcore_id = rte_lcore_id();
>  	int ret = 0;
>  	uint64_t start_cycles, end_cycles;
>  	uint64_t time_diff = 0, hz = rte_get_timer_hz();
> @@ -139,6 +168,9 @@ per_lcore_mempool_test(void *arg)
>  		GOTO_ERR(ret, out);
>  	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
>  		GOTO_ERR(ret, out);
> +	/* for constant n, n_get_bulk and n_put_bulk must be the same */
> +	if (n_get_bulk < 0 && n_put_bulk != n_get_bulk)
> +		GOTO_ERR(ret, out);
>  
>  	stats[lcore_id].enq_count = 0;
>  
> @@ -149,30 +181,18 @@ per_lcore_mempool_test(void *arg)
>  	start_cycles = rte_get_timer_cycles();
>  
>  	while (time_diff/hz < TIME_S) {
> -		for (i = 0; likely(i < (N/n_keep)); i++) {
> -			/* get n_keep objects by bulk of n_bulk */
> -			idx = 0;
> -			while (idx < n_keep) {
> -				ret = rte_mempool_generic_get(mp,
> -							      &obj_table[idx],
> -							      n_get_bulk,
> -							      cache);
> -				if (unlikely(ret < 0)) {
> -					rte_mempool_dump(stdout, mp);
> -					/* in this case, objects are lost... */
> -					GOTO_ERR(ret, out);
> -				}
> -				idx += n_get_bulk;
> -			}
> -
> -			/* put the objects back */
> -			idx = 0;
> -			while (idx < n_keep) {
> -				rte_mempool_generic_put(mp, &obj_table[idx],
> -							n_put_bulk,
> -							cache);
> -				idx += n_put_bulk;
> -			}
> +		if (n_get_bulk > 0) {
> +			test_loop(n_keep, n_get_bulk, n_put_bulk);
> +		} else if (n_get_bulk == -1) {
> +			test_loop(-n_keep, 1, 1);
> +		} else if (n_get_bulk == -4) {
> +			test_loop(-n_keep, 4, 4);
> +		} else if (n_get_bulk == -(int)CACHE_LINE_BURST) {
> +			test_loop(-n_keep, CACHE_LINE_BURST, CACHE_LINE_BURST);
> +		} else if (n_get_bulk == -32) {
> +			test_loop(-n_keep, 32, 32);
> +		} else {
> +			GOTO_ERR(ret, out);
>  		}
>  		end_cycles = rte_get_timer_cycles();
>  		time_diff = end_cycles - start_cycles;

I'm not convinced that having negative values to mean "constant" is
clear. I'd prefer to have another global variable "use_constant_values",
which would give something like this:

		if (!use_constant_values)
			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
		else if (n_get_bulk == 1)
			ret = test_loop(mp, cache, n_keep, 1, 1);
		else if (n_get_bulk == 4)
			ret = test_loop(mp, cache, n_keep, 4, 4);
		else if (n_get_bulk == CACHE_LINE_BURST)
			ret = test_loop(mp, cache, n_keep,
					CACHE_LINE_BURST, CACHE_LINE_BURST);
		else if (n_get_bulk == 32)
			ret = test_loop(mp, cache, n_keep, 32, 32);
		else
			ret = -1;

		if (ret < 0)
			GOTO_ERR(ret, out);


> @@ -192,10 +212,10 @@ per_lcore_mempool_test(void *arg)
>  static int
>  launch_cores(struct rte_mempool *mp, unsigned int cores)
>  {
> -	unsigned lcore_id;
> +	unsigned int lcore_id;
>  	uint64_t rate;
>  	int ret;
> -	unsigned cores_save = cores;
> +	unsigned int cores_save = cores;
>  
>  	__atomic_store_n(&synchro, 0, __ATOMIC_RELAXED);
>  
> @@ -203,10 +223,14 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
>  	memset(stats, 0, sizeof(stats));
>  
>  	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
> -	       "n_put_bulk=%u n_keep=%u ",
> +	       "n_put_bulk=%u n_keep=%u constant_n=%s ",
>  	       use_external_cache ?
> -		   external_cache_size : (unsigned) mp->cache_size,
> -	       cores, n_get_bulk, n_put_bulk, n_keep);
> +		   external_cache_size : (unsigned int) mp->cache_size,
> +	       cores,
> +	       n_get_bulk > 0 ? n_get_bulk : -n_get_bulk,
> +	       n_put_bulk > 0 ? n_put_bulk : -n_put_bulk,
> +	       n_keep > 0 ? n_keep : -n_keep,
> +	       n_get_bulk > 0 ? "false" : "true");
>  

This would become much simpler with this new variable.

>  	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
>  		printf("mempool is not full\n");
> @@ -253,12 +277,13 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
>  static int
>  do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
>  {
> -	unsigned bulk_tab_get[] = { 1, 4, 32, 0 };
> -	unsigned bulk_tab_put[] = { 1, 4, 32, 0 };
> -	unsigned keep_tab[] = { 32, 128, 0 };
> -	unsigned *get_bulk_ptr;
> -	unsigned *put_bulk_ptr;
> -	unsigned *keep_ptr;
> +	/* Negative n_get_bulk values represent constants in the test. */
> +	int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, -1, -4, -(int)CACHE_LINE_BURST, -32, 0 };
> +	int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> +	int keep_tab[] = { 32, 128, 512, 0 };
> +	int *get_bulk_ptr;
> +	int *put_bulk_ptr;
> +	int *keep_ptr;
>  	int ret;
>  

Same here, changes would be minimal.

>  	for (get_bulk_ptr = bulk_tab_get; *get_bulk_ptr; get_bulk_ptr++) {
> @@ -266,13 +291,16 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
>  			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
>  
>  				n_get_bulk = *get_bulk_ptr;
> -				n_put_bulk = *put_bulk_ptr;
> -				n_keep = *keep_ptr;
> +				n_put_bulk = n_get_bulk > 0 ? *put_bulk_ptr : n_get_bulk;
> +				n_keep = n_get_bulk > 0 ? *keep_ptr : -*keep_ptr;
>  				ret = launch_cores(mp, cores);
>  
>  				if (ret < 0)
>  					return -1;

No change would be required above (except use_constant_values = 0).
And below, we could simply add instead:

				/* replay test with constant values */
				if (n_get_bulk == n_put_bulk) {
					use_constant_values = 1;
					ret = launch_cores(mp, cores);
					if (ret < 0)
						return -1;
				}
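
This way every (get, put, keep) combination runs once with runtime
values and, whenever the get and put sizes match, is replayed with
compile-time constants, so the two rates are printed for identical
parameters.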



If you are ok with the proposed changes, I can directly submit a v2,
since I already made them locally.

Thanks,
Olivier


* RE: [PATCH] mempool: test performance with constant n
  2022-01-24 10:26 ` Olivier Matz
@ 2022-01-24 10:37   ` Morten Brørup
  0 siblings, 0 replies; 9+ messages in thread
From: Morten Brørup @ 2022-01-24 10:37 UTC (permalink / raw)
  To: Olivier Matz; +Cc: andrew.rybchenko, thomas, bruce.richardson, jerinjacobk, dev

> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Monday, 24 January 2022 11.26
> 
> Hi Morten,
> 
> Thank you for enhancing the mempool test. Please see some comments
> below.
> 
> On Wed, Jan 19, 2022 at 12:37:32PM +0100, Morten Brørup wrote:
> > "What gets measured gets done."
> >
> > This patch adds mempool performance tests where the number of objects to
> > put and get is constant at compile time, which may significantly improve
> > the performance of these functions. [*]
> >
> > Also, it is ensured that the array holding the objects used for testing
> > is cache line aligned, for maximum performance.
> >
> > And finally, the following entries are added to the list of tests:
> > - Number of kept objects: 512
> > - Number of objects to get and to put: The number of pointers fitting
> >   into a cache line, i.e. 8 or 16
> >
> > [*] Some example performance test (with cache) results:
> >
> > get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
> > get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462
> >
> > get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
> > get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643
> >
> > get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
> > get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> >  app/test/test_mempool_perf.c | 120 +++++++++++++++++++++--------------
> >  1 file changed, 74 insertions(+), 46 deletions(-)
> >
> > diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
> > index 87ad251367..ffefe934d5 100644
> > --- a/app/test/test_mempool_perf.c
> > +++ b/app/test/test_mempool_perf.c
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2010-2014 Intel Corporation
> > + * Copyright(c) 2022 SmartShare Systems
> >   */
> >
> >  #include <string.h>
> > @@ -55,19 +56,24 @@
> >   *
> >   *      - Bulk get from 1 to 32
> >   *      - Bulk put from 1 to 32
> > + *      - Bulk get and put from 1 to 32, compile time constant
> >   *
> >   *    - Number of kept objects (*n_keep*)
> >   *
> >   *      - 32
> >   *      - 128
> > + *      - 512
> >   */
> >
> >  #define N 65536
> >  #define TIME_S 5
> >  #define MEMPOOL_ELT_SIZE 2048
> > -#define MAX_KEEP 128
> > +#define MAX_KEEP 512
> >  #define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
> >
> > +/* Number of pointers fitting into one cache line. */
> > +#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE/sizeof(uintptr_t))
> > +
> 
> nit: I think it's better to follow the coding rules and add a space
> around the '/', even if I can see the line right above does not follow
> this convention
> 
> >  #define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
> >  #define RET_ERR() do {							\
> >  		LOG_ERR();						\
> > @@ -80,16 +86,16 @@
> >  	} while (0)
> >
> >  static int use_external_cache;
> > -static unsigned external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
> > +static unsigned int external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
> >
> >  static uint32_t synchro;
> >
> >  /* number of objects in one bulk operation (get or put) */
> > -static unsigned n_get_bulk;
> > -static unsigned n_put_bulk;
> > +static int n_get_bulk;
> > +static int n_put_bulk;
> >
> >  /* number of objects retrieved from mempool before putting them back */
> > -static unsigned n_keep;
> > +static int n_keep;
> >
> >  /* number of enqueues / dequeues */
> >  struct mempool_test_stats {
> > @@ -104,20 +110,43 @@ static struct mempool_test_stats stats[RTE_MAX_LCORE];
> >   */
> >  static void
> >  my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
> > -	    void *obj, unsigned i)
> > +	    void *obj, unsigned int i)
> >  {
> >  	uint32_t *objnum = obj;
> >  	memset(obj, 0, mp->elt_size);
> >  	*objnum = i;
> >  }
> >
> > +#define test_loop(x_keep, x_get_bulk, x_put_bulk)		       \
> > +		for (i = 0; likely(i < (N/x_keep)); i++) {	      \
> > +			/* get x_keep objects by bulk of x_get_bulk */  \
> > +			for (idx = 0; idx < x_keep; idx += x_get_bulk) {\
> > +				ret = rte_mempool_generic_get(mp,       \
> > +						&obj_table[idx],	\
> > +						x_get_bulk,	     \
> > +						cache);		 \
> > +				if (unlikely(ret < 0)) {		\
> > +					rte_mempool_dump(stdout, mp);   \
> > +					GOTO_ERR(ret, out);	     \
> > +				}				       \
> > +			}					       \
> > +									\
> > +			/* put the objects back by bulk of x_put_bulk */\
> > +			for (idx = 0; idx < x_keep; idx += x_put_bulk) {\
> > +				rte_mempool_generic_put(mp,	     \
> > +						&obj_table[idx],	\
> > +						x_put_bulk,	     \
> > +						cache);		 \
> > +			}					       \
> > +		}
> > +
> 
> I think a static __rte_always_inline function would do the job and is
> clearer. Something like this:
> 
> static __rte_always_inline int
> test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
> 	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
> {
> 	void *obj_table[MAX_KEEP] __rte_cache_aligned;
> 	unsigned int idx;
> 	unsigned int i;
> 	int ret;
> 
> 	for (i = 0; likely(i < (N / x_keep)); i++) {
> 		/* get x_keep objects by bulk of x_get_bulk */
> 		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
> 			ret = rte_mempool_generic_get(mp,
> 						      &obj_table[idx],
> 						      x_get_bulk,
> 						      cache);
> 			if (unlikely(ret < 0)) {
> 				rte_mempool_dump(stdout, mp);
> 				return ret;
> 			}
> 		}
> 
> 		/* put the objects back by bulk of x_put_bulk */
> 		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
> 			rte_mempool_generic_put(mp,
> 						&obj_table[idx],
> 						x_put_bulk,
> 						cache);
> 		}
> 	}
> 
> 	return 0;
> }
> 
> 
> >  static int
> >  per_lcore_mempool_test(void *arg)
> >  {
> > -	void *obj_table[MAX_KEEP];
> > -	unsigned i, idx;
> > +	void *obj_table[MAX_KEEP] __rte_cache_aligned;
> > +	int i, idx;
> >  	struct rte_mempool *mp = arg;
> > -	unsigned lcore_id = rte_lcore_id();
> > +	unsigned int lcore_id = rte_lcore_id();
> >  	int ret = 0;
> >  	uint64_t start_cycles, end_cycles;
> >  	uint64_t time_diff = 0, hz = rte_get_timer_hz();
> > @@ -139,6 +168,9 @@ per_lcore_mempool_test(void *arg)
> >  		GOTO_ERR(ret, out);
> >  	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
> >  		GOTO_ERR(ret, out);
> > +	/* for constant n, n_get_bulk and n_put_bulk must be the same */
> > +	if (n_get_bulk < 0 && n_put_bulk != n_get_bulk)
> > +		GOTO_ERR(ret, out);
> >
> >  	stats[lcore_id].enq_count = 0;
> >
> > @@ -149,30 +181,18 @@ per_lcore_mempool_test(void *arg)
> >  	start_cycles = rte_get_timer_cycles();
> >
> >  	while (time_diff/hz < TIME_S) {
> > -		for (i = 0; likely(i < (N/n_keep)); i++) {
> > -			/* get n_keep objects by bulk of n_bulk */
> > -			idx = 0;
> > -			while (idx < n_keep) {
> > -				ret = rte_mempool_generic_get(mp,
> > -							      &obj_table[idx],
> > -							      n_get_bulk,
> > -							      cache);
> > -				if (unlikely(ret < 0)) {
> > -					rte_mempool_dump(stdout, mp);
> > -					/* in this case, objects are lost... */
> > -					GOTO_ERR(ret, out);
> > -				}
> > -				idx += n_get_bulk;
> > -			}
> > -
> > -			/* put the objects back */
> > -			idx = 0;
> > -			while (idx < n_keep) {
> > -				rte_mempool_generic_put(mp, &obj_table[idx],
> > -							n_put_bulk,
> > -							cache);
> > -				idx += n_put_bulk;
> > -			}
> > +		if (n_get_bulk > 0) {
> > +			test_loop(n_keep, n_get_bulk, n_put_bulk);
> > +		} else if (n_get_bulk == -1) {
> > +			test_loop(-n_keep, 1, 1);
> > +		} else if (n_get_bulk == -4) {
> > +			test_loop(-n_keep, 4, 4);
> > +		} else if (n_get_bulk == -(int)CACHE_LINE_BURST) {
> > +			test_loop(-n_keep, CACHE_LINE_BURST, CACHE_LINE_BURST);
> > +		} else if (n_get_bulk == -32) {
> > +			test_loop(-n_keep, 32, 32);
> > +		} else {
> > +			GOTO_ERR(ret, out);
> >  		}
> >  		end_cycles = rte_get_timer_cycles();
> >  		time_diff = end_cycles - start_cycles;
> 
> I'm not convinced that having negative values to mean "constant" is
> clear. I'd prefer to have another global variable "use_constant_values",
> which would give something like this:
> 
> 		if (!use_constant_values)
> 			ret = test_loop(mp, cache, n_keep, n_get_bulk,
> 			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
> 			ret = test_loop(mp, cache, n_keep, 1, 1);
> 		else if (n_get_bulk == 4)
> 			ret = test_loop(mp, cache, n_keep, 4, 4);
> 		else if (n_get_bulk == CACHE_LINE_BURST)
> 			ret = test_loop(mp, cache, n_keep,
> 					CACHE_LINE_BURST, CACHE_LINE_BURST);
> 		else if (n_get_bulk == 32)
> 			ret = test_loop(mp, cache, n_keep, 32, 32);
> 		else
> 			ret = -1;
> 
> 		if (ret < 0)
> 			GOTO_ERR(ret, out);
> 
> 
> > @@ -192,10 +212,10 @@ per_lcore_mempool_test(void *arg)
> >  static int
> >  launch_cores(struct rte_mempool *mp, unsigned int cores)
> >  {
> > -	unsigned lcore_id;
> > +	unsigned int lcore_id;
> >  	uint64_t rate;
> >  	int ret;
> > -	unsigned cores_save = cores;
> > +	unsigned int cores_save = cores;
> >
> >  	__atomic_store_n(&synchro, 0, __ATOMIC_RELAXED);
> >
> > @@ -203,10 +223,14 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >  	memset(stats, 0, sizeof(stats));
> >
> >  	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
> > -	       "n_put_bulk=%u n_keep=%u ",
> > +	       "n_put_bulk=%u n_keep=%u constant_n=%s ",
> >  	       use_external_cache ?
> > -		   external_cache_size : (unsigned) mp->cache_size,
> > -	       cores, n_get_bulk, n_put_bulk, n_keep);
> > +		   external_cache_size : (unsigned int) mp->cache_size,
> > +	       cores,
> > +	       n_get_bulk > 0 ? n_get_bulk : -n_get_bulk,
> > +	       n_put_bulk > 0 ? n_put_bulk : -n_put_bulk,
> > +	       n_keep > 0 ? n_keep : -n_keep,
> > +	       n_get_bulk > 0 ? "false" : "true");
> >
> 
> This would become much simpler with this new variable.
> 
> >  	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
> >  		printf("mempool is not full\n");
> > @@ -253,12 +277,13 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >  static int
> >  do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
> >  {
> > -	unsigned bulk_tab_get[] = { 1, 4, 32, 0 };
> > -	unsigned bulk_tab_put[] = { 1, 4, 32, 0 };
> > -	unsigned keep_tab[] = { 32, 128, 0 };
> > -	unsigned *get_bulk_ptr;
> > -	unsigned *put_bulk_ptr;
> > -	unsigned *keep_ptr;
> > +	/* Negative n_get_bulk values represent constants in the test. */
> > +	int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, -1, -4, -(int)CACHE_LINE_BURST, -32, 0 };
> > +	int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> > +	int keep_tab[] = { 32, 128, 512, 0 };
> > +	int *get_bulk_ptr;
> > +	int *put_bulk_ptr;
> > +	int *keep_ptr;
> >  	int ret;
> >
> 
> Same here, changes would be minimal.
> 
> >  	for (get_bulk_ptr = bulk_tab_get; *get_bulk_ptr; get_bulk_ptr++) {
> > @@ -266,13 +291,16 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
> >  			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
> >
> >  				n_get_bulk = *get_bulk_ptr;
> > -				n_put_bulk = *put_bulk_ptr;
> > -				n_keep = *keep_ptr;
> > +				n_put_bulk = n_get_bulk > 0 ? *put_bulk_ptr : n_get_bulk;
> > +				n_keep = n_get_bulk > 0 ? *keep_ptr : -*keep_ptr;
> >  				ret = launch_cores(mp, cores);
> >
> >  				if (ret < 0)
> >  					return -1;
> 
> No change would be required above (except use_constant_values = 0).
> And below, we could simply add instead:
> 
> 				/* replay test with constant values */
> 				if (n_get_bulk == n_put_bulk) {
> 					use_constant_values = 1;
> 					ret = launch_cores(mp, cores);
> 					if (ret < 0)
> 						return -1;
> 				}
> 
> 
> 
> If you are ok with the proposed changes, I can directly submit a v2,
> since I already made them locally.

Please do! No need to do the work one more time. :-)

> 
> Thanks,
> Olivier



* [PATCH] mempool: test performance with constant n
  2022-01-19 11:37 [PATCH] mempool: test performance with constant n Morten Brørup
  2022-01-24 10:26 ` Olivier Matz
@ 2022-01-24 14:53 ` Olivier Matz
  2022-01-24 14:57   ` Olivier Matz
  2022-01-24 14:59 ` [PATCH v2] " Olivier Matz
  2 siblings, 1 reply; 9+ messages in thread
From: Olivier Matz @ 2022-01-24 14:53 UTC (permalink / raw)
  To: dev
  Cc: mb, andrew.rybchenko, bruce.richardson, jerinjacobk,
	olivier.matz, thomas

From: Morten Brørup <mb@smartsharesystems.com>

"What gets measured gets done."

This patch adds mempool performance tests where the number of objects to
put and get is constant at compile time, which may significantly improve
the performance of these functions. [*]

Also, it is ensured that the array holding the objects used for testing
is cache line aligned, for maximum performance.

And finally, the following entries are added to the list of tests:
- Number of kept objects: 512
- Number of objects to get and to put: The number of pointers fitting
  into a cache line, i.e. 8 or 16

[*] Some example performance test (with cache) results:

get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462

get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643

get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---

Hi Morten,

Here is the updated patch.

I launched the mempool_perf test on my desktop machine, but I don't reproduce the numbers: constant or
non-constant give almost the same rate on my machine (it's even worse with constants). I tested with
your initial patch and with this one. Can you please try this patch, and/or give some details about
your test environment? Here is what I get:

with your patch:
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=152620236
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=144716595
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=306996838
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=287375359
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=977626723
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=963103944

with this patch:
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=156460646
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=142173798
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=312410111
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=281699942
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=983315247
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=950350638


v2:
- use a flag instead of a negative value to enable tests with
  compile-time constant
- use a static inline function instead of a macro
- remove some "noise" (do not change variable type when not required)


Thanks,
Olivier

 app/test/test_mempool_perf.c | 110 ++++++++++++++++++++++++-----------
 1 file changed, 77 insertions(+), 33 deletions(-)

diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index 87ad251367..ce7c6241ab 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
  */

 #include <string.h>
@@ -55,19 +56,24 @@
  *
  *      - Bulk get from 1 to 32
  *      - Bulk put from 1 to 32
+ *      - Bulk get and put from 1 to 32, compile time constant
  *
  *    - Number of kept objects (*n_keep*)
  *
  *      - 32
  *      - 128
+ *      - 512
  */

 #define N 65536
 #define TIME_S 5
 #define MEMPOOL_ELT_SIZE 2048
-#define MAX_KEEP 128
+#define MAX_KEEP 512
 #define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)

+/* Number of pointers fitting into one cache line. */
+#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))
+
 #define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
 #define RET_ERR() do {							\
 		LOG_ERR();						\
@@ -91,6 +97,9 @@ static unsigned n_put_bulk;
 /* number of objects retrieved from mempool before putting them back */
 static unsigned n_keep;

+/* true if we want to test with constant n_get_bulk and n_put_bulk */
+static int use_constant_values;
+
 /* number of enqueues / dequeues */
 struct mempool_test_stats {
 	uint64_t enq_count;
@@ -111,11 +120,43 @@ my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
 	*objnum = i;
 }

+static __rte_always_inline int
+test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
+	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
+{
+	void *obj_table[MAX_KEEP] __rte_cache_aligned;
+	unsigned int idx;
+	unsigned int i;
+	int ret;
+
+	for (i = 0; likely(i < (N / x_keep)); i++) {
+		/* get x_keep objects by bulk of x_get_bulk */
+		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
+			ret = rte_mempool_generic_get(mp,
+						      &obj_table[idx],
+						      x_get_bulk,
+						      cache);
+			if (unlikely(ret < 0)) {
+				rte_mempool_dump(stdout, mp);
+				return ret;
+			}
+		}
+
+		/* put the objects back by bulk of x_put_bulk */
+		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
+			rte_mempool_generic_put(mp,
+						&obj_table[idx],
+						x_put_bulk,
+						cache);
+		}
+	}
+
+	return 0;
+}
+
 static int
 per_lcore_mempool_test(void *arg)
 {
-	void *obj_table[MAX_KEEP];
-	unsigned i, idx;
 	struct rte_mempool *mp = arg;
 	unsigned lcore_id = rte_lcore_id();
 	int ret = 0;
@@ -139,6 +180,9 @@ per_lcore_mempool_test(void *arg)
 		GOTO_ERR(ret, out);
 	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
 		GOTO_ERR(ret, out);
+	/* for constant n, n_get_bulk and n_put_bulk must be the same */
+	if (use_constant_values && n_put_bulk != n_get_bulk)
+		GOTO_ERR(ret, out);

 	stats[lcore_id].enq_count = 0;

@@ -149,31 +193,23 @@ per_lcore_mempool_test(void *arg)
 	start_cycles = rte_get_timer_cycles();

 	while (time_diff/hz < TIME_S) {
-		for (i = 0; likely(i < (N/n_keep)); i++) {
-			/* get n_keep objects by bulk of n_bulk */
-			idx = 0;
-			while (idx < n_keep) {
-				ret = rte_mempool_generic_get(mp,
-							      &obj_table[idx],
-							      n_get_bulk,
-							      cache);
-				if (unlikely(ret < 0)) {
-					rte_mempool_dump(stdout, mp);
-					/* in this case, objects are lost... */
-					GOTO_ERR(ret, out);
-				}
-				idx += n_get_bulk;
-			}
+		if (!use_constant_values)
+			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
+		else if (n_get_bulk == 1)
+			ret = test_loop(mp, cache, n_keep, 1, 1);
+		else if (n_get_bulk == 4)
+			ret = test_loop(mp, cache, n_keep, 4, 4);
+		else if (n_get_bulk == CACHE_LINE_BURST)
+			ret = test_loop(mp, cache, n_keep,
+					CACHE_LINE_BURST, CACHE_LINE_BURST);
+		else if (n_get_bulk == 32)
+			ret = test_loop(mp, cache, n_keep, 32, 32);
+		else
+			ret = -1;
+
+		if (ret < 0)
+			GOTO_ERR(ret, out);

-			/* put the objects back */
-			idx = 0;
-			while (idx < n_keep) {
-				rte_mempool_generic_put(mp, &obj_table[idx],
-							n_put_bulk,
-							cache);
-				idx += n_put_bulk;
-			}
-		}
 		end_cycles = rte_get_timer_cycles();
 		time_diff = end_cycles - start_cycles;
 		stats[lcore_id].enq_count += N;
@@ -203,10 +239,10 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 	memset(stats, 0, sizeof(stats));

 	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
-	       "n_put_bulk=%u n_keep=%u ",
+	       "n_put_bulk=%u n_keep=%u constant_n=%u ",
 	       use_external_cache ?
 		   external_cache_size : (unsigned) mp->cache_size,
-	       cores, n_get_bulk, n_put_bulk, n_keep);
+	       cores, n_get_bulk, n_put_bulk, n_keep, use_constant_values);

 	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
 		printf("mempool is not full\n");
@@ -253,9 +289,9 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 static int
 do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 {
-	unsigned bulk_tab_get[] = { 1, 4, 32, 0 };
-	unsigned bulk_tab_put[] = { 1, 4, 32, 0 };
-	unsigned keep_tab[] = { 32, 128, 0 };
+	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
+	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
+	unsigned int keep_tab[] = { 32, 128, 512, 0 };
 	unsigned *get_bulk_ptr;
 	unsigned *put_bulk_ptr;
 	unsigned *keep_ptr;
@@ -265,13 +301,21 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
 			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {

+				use_constant_values = 0;
 				n_get_bulk = *get_bulk_ptr;
 				n_put_bulk = *put_bulk_ptr;
 				n_keep = *keep_ptr;
 				ret = launch_cores(mp, cores);
-
 				if (ret < 0)
 					return -1;
+
+				/* replay test with constant values */
+				if (n_get_bulk == n_put_bulk) {
+					use_constant_values = 1;
+					ret = launch_cores(mp, cores);
+					if (ret < 0)
+						return -1;
+				}
 			}
 		}
 	}
--
2.30.2


* Re: [PATCH] mempool: test performance with constant n
  2022-01-24 14:53 ` Olivier Matz
@ 2022-01-24 14:57   ` Olivier Matz
  0 siblings, 0 replies; 9+ messages in thread
From: Olivier Matz @ 2022-01-24 14:57 UTC (permalink / raw)
  To: dev; +Cc: mb, andrew.rybchenko, bruce.richardson, jerinjacobk, thomas

On Mon, Jan 24, 2022 at 03:53:09PM +0100, Olivier Matz wrote:
> From: Morten Brørup <mb@smartsharesystems.com>
> 
> "What gets measured gets done."
> 
> This patch adds mempool performance tests where the number of objects to
> put and get is constant at compile time, which may significantly improve
> the performance of these functions. [*]
> 
> Also, it is ensured that the array holding the objects used for testing
> is cache line aligned, for maximum performance.
> 
> And finally, the following entries are added to the list of tests:
> - Number of kept objects: 512
> - Number of objects to get and to put: The number of pointers fitting
>   into a cache line, i.e. 8 or 16
> 
> [*] Some example performance test (with cache) results:
> 
> get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
> get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462
> 
> get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
> get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643
> 
> get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
> get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

Sorry, wrong title, "v2" is missing. Please ignore, I'm resending it.


* [PATCH v2] mempool: test performance with constant n
  2022-01-19 11:37 [PATCH] mempool: test performance with constant n Morten Brørup
  2022-01-24 10:26 ` Olivier Matz
  2022-01-24 14:53 ` Olivier Matz
@ 2022-01-24 14:59 ` Olivier Matz
  2022-01-24 17:20   ` Morten Brørup
  2022-02-02 22:39   ` Thomas Monjalon
  2 siblings, 2 replies; 9+ messages in thread
From: Olivier Matz @ 2022-01-24 14:59 UTC (permalink / raw)
  To: dev
  Cc: mb, andrew.rybchenko, bruce.richardson, jerinjacobk,
	olivier.matz, thomas

From: Morten Brørup <mb@smartsharesystems.com>

"What gets measured gets done."

This patch adds mempool performance tests where the number of objects to
put and get is constant at compile time, which may significantly improve
the performance of these functions. [*]

Also, it is ensured that the array holding the objects used for testing
is cache line aligned, for maximum performance.

And finally, the following entries are added to the list of tests:
- Number of kept objects: 512
- Number of objects to get and to put: The number of pointers fitting
  into a cache line, i.e. 8 or 16

[*] Some example performance test (with cache) results:

get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462

get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643

get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---

Hi Morten,

Here is the updated patch.

I launched the mempool_perf test on my desktop machine, but I don't reproduce the numbers: constant or
non-constant give almost the same rate on my machine (it's even worse with constants). I tested with
your initial patch and with this one. Can you please try this patch, and/or give some details about
your test environment? Here is what I get:

with your patch:
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=152620236
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=144716595
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=306996838
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=287375359
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=977626723
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=963103944

with this patch:
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=156460646
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=142173798
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=312410111
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=281699942
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=983315247
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=950350638


v2:
- use a flag instead of a negative value to enable tests with
  compile-time constant
- use a static inline function instead of a macro
- remove some "noise" (do not change variable type when not required)


Thanks,
Olivier


 app/test/test_mempool_perf.c | 110 ++++++++++++++++++++++++-----------
 1 file changed, 77 insertions(+), 33 deletions(-)

diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index 87ad251367..ce7c6241ab 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2022 SmartShare Systems
  */

 #include <string.h>
@@ -55,19 +56,24 @@
  *
  *      - Bulk get from 1 to 32
  *      - Bulk put from 1 to 32
+ *      - Bulk get and put from 1 to 32, compile time constant
  *
  *    - Number of kept objects (*n_keep*)
  *
  *      - 32
  *      - 128
+ *      - 512
  */

 #define N 65536
 #define TIME_S 5
 #define MEMPOOL_ELT_SIZE 2048
-#define MAX_KEEP 128
+#define MAX_KEEP 512
 #define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)

+/* Number of pointers fitting into one cache line. */
+#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))
+
 #define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
 #define RET_ERR() do {							\
 		LOG_ERR();						\
@@ -91,6 +97,9 @@ static unsigned n_put_bulk;
 /* number of objects retrieved from mempool before putting them back */
 static unsigned n_keep;

+/* true if we want to test with constant n_get_bulk and n_put_bulk */
+static int use_constant_values;
+
 /* number of enqueues / dequeues */
 struct mempool_test_stats {
 	uint64_t enq_count;
@@ -111,11 +120,43 @@ my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
 	*objnum = i;
 }

+static __rte_always_inline int
+test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
+	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
+{
+	void *obj_table[MAX_KEEP] __rte_cache_aligned;
+	unsigned int idx;
+	unsigned int i;
+	int ret;
+
+	for (i = 0; likely(i < (N / x_keep)); i++) {
+		/* get x_keep objects by bulk of x_get_bulk */
+		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
+			ret = rte_mempool_generic_get(mp,
+						      &obj_table[idx],
+						      x_get_bulk,
+						      cache);
+			if (unlikely(ret < 0)) {
+				rte_mempool_dump(stdout, mp);
+				return ret;
+			}
+		}
+
+		/* put the objects back by bulk of x_put_bulk */
+		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
+			rte_mempool_generic_put(mp,
+						&obj_table[idx],
+						x_put_bulk,
+						cache);
+		}
+	}
+
+	return 0;
+}
+
 static int
 per_lcore_mempool_test(void *arg)
 {
-	void *obj_table[MAX_KEEP];
-	unsigned i, idx;
 	struct rte_mempool *mp = arg;
 	unsigned lcore_id = rte_lcore_id();
 	int ret = 0;
@@ -139,6 +180,9 @@ per_lcore_mempool_test(void *arg)
 		GOTO_ERR(ret, out);
 	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
 		GOTO_ERR(ret, out);
+	/* for constant n, n_get_bulk and n_put_bulk must be the same */
+	if (use_constant_values && n_put_bulk != n_get_bulk)
+		GOTO_ERR(ret, out);

 	stats[lcore_id].enq_count = 0;

@@ -149,31 +193,23 @@ per_lcore_mempool_test(void *arg)
 	start_cycles = rte_get_timer_cycles();

 	while (time_diff/hz < TIME_S) {
-		for (i = 0; likely(i < (N/n_keep)); i++) {
-			/* get n_keep objects by bulk of n_bulk */
-			idx = 0;
-			while (idx < n_keep) {
-				ret = rte_mempool_generic_get(mp,
-							      &obj_table[idx],
-							      n_get_bulk,
-							      cache);
-				if (unlikely(ret < 0)) {
-					rte_mempool_dump(stdout, mp);
-					/* in this case, objects are lost... */
-					GOTO_ERR(ret, out);
-				}
-				idx += n_get_bulk;
-			}
+		if (!use_constant_values)
+			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
+		else if (n_get_bulk == 1)
+			ret = test_loop(mp, cache, n_keep, 1, 1);
+		else if (n_get_bulk == 4)
+			ret = test_loop(mp, cache, n_keep, 4, 4);
+		else if (n_get_bulk == CACHE_LINE_BURST)
+			ret = test_loop(mp, cache, n_keep,
+					CACHE_LINE_BURST, CACHE_LINE_BURST);
+		else if (n_get_bulk == 32)
+			ret = test_loop(mp, cache, n_keep, 32, 32);
+		else
+			ret = -1;
+
+		if (ret < 0)
+			GOTO_ERR(ret, out);

-			/* put the objects back */
-			idx = 0;
-			while (idx < n_keep) {
-				rte_mempool_generic_put(mp, &obj_table[idx],
-							n_put_bulk,
-							cache);
-				idx += n_put_bulk;
-			}
-		}
 		end_cycles = rte_get_timer_cycles();
 		time_diff = end_cycles - start_cycles;
 		stats[lcore_id].enq_count += N;
@@ -203,10 +239,10 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 	memset(stats, 0, sizeof(stats));

 	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
-	       "n_put_bulk=%u n_keep=%u ",
+	       "n_put_bulk=%u n_keep=%u constant_n=%u ",
 	       use_external_cache ?
 		   external_cache_size : (unsigned) mp->cache_size,
-	       cores, n_get_bulk, n_put_bulk, n_keep);
+	       cores, n_get_bulk, n_put_bulk, n_keep, use_constant_values);

 	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
 		printf("mempool is not full\n");
@@ -253,9 +289,9 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
 static int
 do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 {
-	unsigned bulk_tab_get[] = { 1, 4, 32, 0 };
-	unsigned bulk_tab_put[] = { 1, 4, 32, 0 };
-	unsigned keep_tab[] = { 32, 128, 0 };
+	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
+	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
+	unsigned int keep_tab[] = { 32, 128, 512, 0 };
 	unsigned *get_bulk_ptr;
 	unsigned *put_bulk_ptr;
 	unsigned *keep_ptr;
@@ -265,13 +301,21 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
 		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
 			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {

+				use_constant_values = 0;
 				n_get_bulk = *get_bulk_ptr;
 				n_put_bulk = *put_bulk_ptr;
 				n_keep = *keep_ptr;
 				ret = launch_cores(mp, cores);
-
 				if (ret < 0)
 					return -1;
+
+				/* replay test with constant values */
+				if (n_get_bulk == n_put_bulk) {
+					use_constant_values = 1;
+					ret = launch_cores(mp, cores);
+					if (ret < 0)
+						return -1;
+				}
 			}
 		}
 	}
--
2.30.2


* RE: [PATCH v2] mempool: test performance with constant n
  2022-01-24 14:59 ` [PATCH v2] " Olivier Matz
@ 2022-01-24 17:20   ` Morten Brørup
  2022-01-25 12:56     ` Olivier Matz
  2022-02-02 22:39   ` Thomas Monjalon
  1 sibling, 1 reply; 9+ messages in thread
From: Morten Brørup @ 2022-01-24 17:20 UTC (permalink / raw)
  To: Olivier Matz, dev; +Cc: andrew.rybchenko, bruce.richardson, jerinjacobk, thomas

> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Monday, 24 January 2022 16.00
> 
> From: Morten Brørup <mb@smartsharesystems.com>
> 
> "What gets measured gets done."
> 
> This patch adds mempool performance tests where the number of objects to
> put and get is constant at compile time, which may significantly improve
> the performance of these functions. [*]
> 
> Also, it is ensured that the array holding the objects used for testing
> is cache line aligned, for maximum performance.
> 
> And finally, the following entries are added to the list of tests:
> - Number of kept objects: 512
> - Number of objects to get and to put: The number of pointers fitting
>   into a cache line, i.e. 8 or 16
> 
> [*] Some example performance test (with cache) results:
> 
> get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
> get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462
> 
> get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
> get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643
> 
> get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
> get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
> 
> Hi Morten,
> 
> Here is the updated patch.
> 
> I launched the mempool_perf test on my desktop machine, but I don't
> reproduce the numbers: constant or non-constant give almost the same
> rate on my machine (it's even worse with constants). I tested with
> your initial patch and with this one. Can you please try this patch,
> and/or give some details about your test environment?

Test environment:
VMware virtual machine running Ubuntu 20.04.3 LTS.
4 CPUs and 8 GB RAM assigned.
The physical CPU is a Xeon E5-2620 v4 with plenty of RAM.
Although other VMs are running on the same server, it is not very oversubscribed.

Hugepages established with:
usertools/dpdk-hugepages.py -p 2M --setup 2G

Build steps:
meson -Dplatform=generic work
cd work
ninja
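
The test itself is then run from the dpdk-test binary (assuming the
standard location under the build directory):

./work/app/test/dpdk-test
RTE>> mempool_perf_autotest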

> Here is what I get:
> 
> with your patch:
> mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=152620236
> mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=144716595
> mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=306996838
> mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=287375359
> mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=977626723
> mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=963103944

My test results were with an experimental, optimized version of the mempool library, which showed a larger difference. (This was the reason for updating the perf test - to measure the effects of optimizing the mempool library.)

However, testing the patch (version 1) with a brand new git checkout still shows a huge difference, e.g.:

mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=501009612
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=799014912

You should also see a significant difference when testing.
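
For context, the effect being measured is compile-time specialization of the bulk size. A simplified sketch of the principle (illustration only, not the actual mempool code):

static inline void
copy_burst(void **dst, void * const *src, unsigned int n)
{
	unsigned int i;

	/* With a compile-time constant n, the compiler can fully
	 * unroll this loop; with a runtime n it emits a generic loop
	 * with per-iteration branching. */
	for (i = 0; i < n; i++)
		dst[i] = src[i];
}

static void
dispatch(void **dst, void * const *src, unsigned int n)
{
	if (n == 8)
		copy_burst(dst, src, 8);	/* n constant: specialized */
	else
		copy_burst(dst, src, n);	/* n variable: generic */
}

The if/else ladder around test_loop() in the patch applies the same idea to the real rte_mempool_generic_get/put calls.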

My rate_persec without constant n is 3 x yours (501 M vs. 156 M ops/s), so the baseline seems wrong! I don't think our server rig is so much faster than your desktop machine. Perhaps mempool debug, telemetry or other background noise is polluting your test.

> 
> with this patch:
> mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=156460646
> mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=142173798
> mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=312410111
> mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=281699942
> mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=983315247
> mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=950350638
> 
> 
> v2:
> - use a flag instead of a negative value to enable tests with
>   compile-time constant
> - use a static inline function instead of a macro
> - remove some "noise" (do not change variable type when not required)
> 
> 
> Thanks,
> Olivier
> 
> 
>  app/test/test_mempool_perf.c | 110 ++++++++++++++++++++++++-----------
>  1 file changed, 77 insertions(+), 33 deletions(-)
> 
> diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
> index 87ad251367..ce7c6241ab 100644
> --- a/app/test/test_mempool_perf.c
> +++ b/app/test/test_mempool_perf.c
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2022 SmartShare Systems
>   */
> 
>  #include <string.h>
> @@ -55,19 +56,24 @@
>   *
>   *      - Bulk get from 1 to 32
>   *      - Bulk put from 1 to 32
> + *      - Bulk get and put from 1 to 32, compile time constant
>   *
>   *    - Number of kept objects (*n_keep*)
>   *
>   *      - 32
>   *      - 128
> + *      - 512
>   */
> 
>  #define N 65536
>  #define TIME_S 5
>  #define MEMPOOL_ELT_SIZE 2048
> -#define MAX_KEEP 128
> +#define MAX_KEEP 512
>  #define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
> 
> +/* Number of pointers fitting into one cache line. */
> +#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))
> +
>  #define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
>  #define RET_ERR() do {							\
>  		LOG_ERR();						\
> @@ -91,6 +97,9 @@ static unsigned n_put_bulk;
>  /* number of objects retrieved from mempool before putting them back */
>  static unsigned n_keep;
> 
> +/* true if we want to test with constant n_get_bulk and n_put_bulk */
> +static int use_constant_values;
> +
>  /* number of enqueues / dequeues */
>  struct mempool_test_stats {
>  	uint64_t enq_count;
> @@ -111,11 +120,43 @@ my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
>  	*objnum = i;
>  }
> 
> +static __rte_always_inline int
> +test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
> +	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
> +{
> +	void *obj_table[MAX_KEEP] __rte_cache_aligned;
> +	unsigned int idx;
> +	unsigned int i;
> +	int ret;
> +
> +	for (i = 0; likely(i < (N / x_keep)); i++) {
> +		/* get x_keep objects by bulk of x_get_bulk */
> +		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
> +			ret = rte_mempool_generic_get(mp,
> +						      &obj_table[idx],
> +						      x_get_bulk,
> +						      cache);
> +			if (unlikely(ret < 0)) {
> +				rte_mempool_dump(stdout, mp);
> +				return ret;
> +			}
> +		}
> +
> +		/* put the objects back by bulk of x_put_bulk */
> +		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
> +			rte_mempool_generic_put(mp,
> +						&obj_table[idx],
> +						x_put_bulk,
> +						cache);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int
>  per_lcore_mempool_test(void *arg)
>  {
> -	void *obj_table[MAX_KEEP];
> -	unsigned i, idx;
>  	struct rte_mempool *mp = arg;
>  	unsigned lcore_id = rte_lcore_id();
>  	int ret = 0;
> @@ -139,6 +180,9 @@ per_lcore_mempool_test(void *arg)
>  		GOTO_ERR(ret, out);
>  	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
>  		GOTO_ERR(ret, out);
> +	/* for constant n, n_get_bulk and n_put_bulk must be the same */
> +	if (use_constant_values && n_put_bulk != n_get_bulk)
> +		GOTO_ERR(ret, out);
> 
>  	stats[lcore_id].enq_count = 0;
> 
> @@ -149,31 +193,23 @@ per_lcore_mempool_test(void *arg)
>  	start_cycles = rte_get_timer_cycles();
> 
>  	while (time_diff/hz < TIME_S) {
> -		for (i = 0; likely(i < (N/n_keep)); i++) {
> -			/* get n_keep objects by bulk of n_bulk */
> -			idx = 0;
> -			while (idx < n_keep) {
> -				ret = rte_mempool_generic_get(mp,
> -							      &obj_table[idx],
> -							      n_get_bulk,
> -							      cache);
> -				if (unlikely(ret < 0)) {
> -					rte_mempool_dump(stdout, mp);
> -					/* in this case, objects are lost... */
> -					GOTO_ERR(ret, out);
> -				}
> -				idx += n_get_bulk;
> -			}
> +		if (!use_constant_values)
> +			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
> +		else if (n_get_bulk == 1)
> +			ret = test_loop(mp, cache, n_keep, 1, 1);
> +		else if (n_get_bulk == 4)
> +			ret = test_loop(mp, cache, n_keep, 4, 4);
> +		else if (n_get_bulk == CACHE_LINE_BURST)
> +			ret = test_loop(mp, cache, n_keep,
> +					CACHE_LINE_BURST, CACHE_LINE_BURST);
> +		else if (n_get_bulk == 32)
> +			ret = test_loop(mp, cache, n_keep, 32, 32);
> +		else
> +			ret = -1;
> +
> +		if (ret < 0)
> +			GOTO_ERR(ret, out);
> 
> -			/* put the objects back */
> -			idx = 0;
> -			while (idx < n_keep) {
> -				rte_mempool_generic_put(mp, &obj_table[idx],
> -							n_put_bulk,
> -							cache);
> -				idx += n_put_bulk;
> -			}
> -		}
>  		end_cycles = rte_get_timer_cycles();
>  		time_diff = end_cycles - start_cycles;
>  		stats[lcore_id].enq_count += N;
> @@ -203,10 +239,10 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
>  	memset(stats, 0, sizeof(stats));
> 
>  	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
> -	       "n_put_bulk=%u n_keep=%u ",
> +	       "n_put_bulk=%u n_keep=%u constant_n=%u ",
>  	       use_external_cache ?
>  		   external_cache_size : (unsigned) mp->cache_size,
> -	       cores, n_get_bulk, n_put_bulk, n_keep);
> +	       cores, n_get_bulk, n_put_bulk, n_keep, use_constant_values);
> 
>  	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
>  		printf("mempool is not full\n");
> @@ -253,9 +289,9 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
>  static int
>  do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
>  {
> -	unsigned bulk_tab_get[] = { 1, 4, 32, 0 };
> -	unsigned bulk_tab_put[] = { 1, 4, 32, 0 };
> -	unsigned keep_tab[] = { 32, 128, 0 };
> +	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> +	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> +	unsigned int keep_tab[] = { 32, 128, 512, 0 };
>  	unsigned *get_bulk_ptr;
>  	unsigned *put_bulk_ptr;
>  	unsigned *keep_ptr;
> @@ -265,13 +301,21 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
>  		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
>  			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
> 
> +				use_constant_values = 0;
>  				n_get_bulk = *get_bulk_ptr;
>  				n_put_bulk = *put_bulk_ptr;
>  				n_keep = *keep_ptr;
>  				ret = launch_cores(mp, cores);
> -
>  				if (ret < 0)
>  					return -1;
> +
> +				/* replay test with constant values */
> +				if (n_get_bulk == n_put_bulk) {
> +					use_constant_values = 1;
> +					ret = launch_cores(mp, cores);
> +					if (ret < 0)
> +						return -1;
> +				}
>  			}
>  		}
>  	}
> --
> 2.30.2



* Re: [PATCH v2] mempool: test performance with constant n
  2022-01-24 17:20   ` Morten Brørup
@ 2022-01-25 12:56     ` Olivier Matz
  0 siblings, 0 replies; 9+ messages in thread
From: Olivier Matz @ 2022-01-25 12:56 UTC (permalink / raw)
  To: Morten Brørup
  Cc: dev, andrew.rybchenko, bruce.richardson, jerinjacobk, thomas

Hi Morten,

On Mon, Jan 24, 2022 at 06:20:49PM +0100, Morten Brørup wrote:
> > [...]
> 
> My test results were with an experimental, optimized version of the mempool library, which showed a larger difference. (This was the reason for updating the perf test - to measure the effects of optimizing the mempool library.)
> 
> However, testing the patch (version 1) with a brand new git checkout still shows a huge difference, e.g.:
> 
> mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=false rate_persec=501009612
> mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=true rate_persec=799014912
> 
> You should also see a significant difference when testing.
> 
> My rate_persec without constant n is 3 x yours (501 M vs. 156 M ops/s), so the baseline seems wrong! I don't think our server rig is so much faster than your desktop machine. Perhaps mempool debug, telemetry or other background noise is polluting your test.

Sorry, I just realized that I was indeed using a "debugoptimized" build.
It's much better in release mode.

mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=1425473536
mempool_autotest cache=512 cores=1 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=2159660236
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=2796342476
mempool_autotest cache=512 cores=2 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=4351577292
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=0 rate_persec=8589777300
mempool_autotest cache=512 cores=12 n_get_bulk=8 n_put_bulk=8 n_keep=128 constant_n=1 rate_persec=13560971258
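
For reference, the buildtype of an existing build directory can be checked and switched with something like:

meson configure work | grep buildtype
meson configure work -Dbuildtype=release
ninja -C work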

Thanks,
Olivier




* Re: [PATCH v2] mempool: test performance with constant n
  2022-01-24 14:59 ` [PATCH v2] " Olivier Matz
  2022-01-24 17:20   ` Morten Brørup
@ 2022-02-02 22:39   ` Thomas Monjalon
  1 sibling, 0 replies; 9+ messages in thread
From: Thomas Monjalon @ 2022-02-02 22:39 UTC (permalink / raw)
  To: mb, Olivier Matz; +Cc: dev, andrew.rybchenko, bruce.richardson, jerinjacobk

24/01/2022 15:59, Olivier Matz:
> From: Morten Brørup <mb@smartsharesystems.com>
> 
> "What gets measured gets done."
> 
> This patch adds mempool performance tests where the number of objects to
> put and get is constant at compile time, which may significantly improve
> the performance of these functions. [*]
> 
> Also, it is ensured that the array holding the object used for testing
> is cache line aligned, for maximum performance.
> 
> And finally, the following entries are added to the list of tests:
> - Number of kept objects: 512
> - Number of objects to get and to put: The number of pointers fitting
>   into a cache line, i.e. 8 or 16
> 
> [*] Some example performance test (with cache) results:
> 
> get_bulk=4 put_bulk=4 keep=128 constant_n=false rate_persec=280480972
> get_bulk=4 put_bulk=4 keep=128 constant_n=true  rate_persec=622159462
> 
> get_bulk=8 put_bulk=8 keep=128 constant_n=false rate_persec=477967155
> get_bulk=8 put_bulk=8 keep=128 constant_n=true  rate_persec=917582643
> 
> get_bulk=32 put_bulk=32 keep=32 constant_n=false rate_persec=871248691
> get_bulk=32 put_bulk=32 keep=32 constant_n=true rate_persec=1134021836
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

Applied, thanks.



