From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gage.eads@intel.com>
Received: from mga04.intel.com (mga04.intel.com [192.55.52.120])
 by dpdk.org (Postfix) with ESMTP id CBE014CA9
 for <dev@dpdk.org>; Mon,  1 Apr 2019 23:15:13 +0200 (CEST)
X-Amp-Result: SKIPPED(no attachment in message)
X-Amp-File-Uploaded: False
Received: from orsmga005.jf.intel.com ([10.7.209.41])
 by fmsmga104.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384;
 01 Apr 2019 14:15:13 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.60,298,1549958400"; d="scan'208";a="312277350"
Received: from txasoft-yocto.an.intel.com ([10.123.72.192])
 by orsmga005.jf.intel.com with ESMTP; 01 Apr 2019 14:15:12 -0700
From: Gage Eads <gage.eads@intel.com>
To: dev@dpdk.org
Cc: olivier.matz@6wind.com, arybchenko@solarflare.com,
 bruce.richardson@intel.com, konstantin.ananyev@intel.com, gavin.hu@arm.com,
 Honnappa.Nagarahalli@arm.com, nd@arm.com, thomas@monjalon.net
Date: Mon,  1 Apr 2019 16:14:25 -0500
Message-Id: <20190401211429.20282-5-gage.eads@intel.com>
X-Mailer: git-send-email 2.13.6
In-Reply-To: <20190401211429.20282-1-gage.eads@intel.com>
References: <20190401001238.17625-1-gage.eads@intel.com>
 <20190401211429.20282-1-gage.eads@intel.com>
Subject: [dpdk-dev] [PATCH v6 4/8] test/stack: add stack perf test
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: DPDK patches and discussions <dev.dpdk.org>
List-Unsubscribe: <https://mails.dpdk.org/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://mails.dpdk.org/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <https://mails.dpdk.org/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
X-List-Received-Date: Mon, 01 Apr 2019 21:15:14 -0000

stack_perf_autotest tests the following with one lcore:
- Cycles to attempt to pop an empty stack
- Cycles to push then pop a single object
- Cycles to push then pop a burst of 32 objects

It also tests the cycles to push then pop a burst of 8 and 32 objects with
the following lcore combinations (if possible):
- Two hyperthreads
- Two physical cores
- Two physical cores on separate NUMA nodes
- All available lcores

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Olivier Matz <olivier.matz@6wind.com>
---
 app/test/Makefile          |   1 +
 app/test/meson.build       |   2 +
 app/test/test_stack_perf.c | 343 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 346 insertions(+)
 create mode 100644 app/test/test_stack_perf.c

diff --git a/app/test/Makefile b/app/test/Makefile
index e5bde81af..b28bed2d4 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -91,6 +91,7 @@ endif
 SRCS-y += test_rwlock.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack.c
+SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack_perf.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer.c
 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_perf.c
diff --git a/app/test/meson.build b/app/test/meson.build
index 56ea13f53..02eb788a4 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -96,6 +96,7 @@ test_sources = files('commands.c',
 	'test_service_cores.c',
 	'test_spinlock.c',
 	'test_stack.c',
+	'test_stack_perf.c',
 	'test_string_fns.c',
 	'test_table.c',
 	'test_table_acl.c',
@@ -241,6 +242,7 @@ perf_test_names = [
         'distributor_perf_autotest',
         'ring_pmd_perf_autotest',
         'pmd_perf_autotest',
+        'stack_perf_autotest',
 ]
 
 # All test cases in driver_test_names list are non-parallel
diff --git a/app/test/test_stack_perf.c b/app/test/test_stack_perf.c
new file mode 100644
index 000000000..484370d30
--- /dev/null
+++ b/app/test/test_stack_perf.c
@@ -0,0 +1,343 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <rte_stack.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+
+#include "test.h"
+
+#define STACK_NAME "STACK_PERF"
+#define MAX_BURST 32
+#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
+ * constants.
+ */
+static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};
+
+static rte_atomic32_t lcore_barrier;
+
+struct lcore_pair {
+	unsigned int c1;
+	unsigned int c2;
+};
+
+static int
+get_two_hyperthreads(struct lcore_pair *lcp)
+{
+	unsigned int socket[2];
+	unsigned int core[2];
+	unsigned int id[2];
+
+	RTE_LCORE_FOREACH(id[0]) {
+		RTE_LCORE_FOREACH(id[1]) {
+			if (id[0] == id[1])
+				continue;
+			core[0] = lcore_config[id[0]].core_id;
+			core[1] = lcore_config[id[1]].core_id;
+			socket[0] = lcore_config[id[0]].socket_id;
+			socket[1] = lcore_config[id[1]].socket_id;
+			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
+				lcp->c1 = id[0];
+				lcp->c2 = id[1];
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+static int
+get_two_cores(struct lcore_pair *lcp)
+{
+	unsigned int socket[2];
+	unsigned int core[2];
+	unsigned int id[2];
+
+	RTE_LCORE_FOREACH(id[0]) {
+		RTE_LCORE_FOREACH(id[1]) {
+			if (id[0] == id[1])
+				continue;
+			core[0] = lcore_config[id[0]].core_id;
+			core[1] = lcore_config[id[1]].core_id;
+			socket[0] = lcore_config[id[0]].socket_id;
+			socket[1] = lcore_config[id[1]].socket_id;
+			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
+				lcp->c1 = id[0];
+				lcp->c2 = id[1];
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+static int
+get_two_sockets(struct lcore_pair *lcp)
+{
+	unsigned int socket[2];
+	unsigned int id[2];
+
+	RTE_LCORE_FOREACH(id[0]) {
+		RTE_LCORE_FOREACH(id[1]) {
+			if (id[0] == id[1])
+				continue;
+			socket[0] = lcore_config[id[0]].socket_id;
+			socket[1] = lcore_config[id[1]].socket_id;
+			if (socket[0] != socket[1]) {
+				lcp->c1 = id[0];
+				lcp->c2 = id[1];
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+/* Measure the cycle cost of popping an empty stack. */
+static void
+test_empty_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 100000000;
+	void *objs[MAX_BURST];
+	unsigned int i;
+
+	uint64_t start = rte_rdtsc();
+
+	for (i = 0; i < iterations; i++)
+		rte_stack_pop(s, objs, bulk_sizes[0]);
+
+	uint64_t end = rte_rdtsc();
+
+	printf("Stack empty pop: %.2F\n",
+	       (double)(end - start) / iterations);
+}
+
+struct thread_args {
+	struct rte_stack *s;
+	unsigned int sz;
+	double avg;
+};
+
+/* Measure the average per-pointer cycle cost of stack push and pop */
+static int
+bulk_push_pop(void *p)
+{
+	unsigned int iterations = 1000000;
+	struct thread_args *args = p;
+	void *objs[MAX_BURST] = {0};
+	unsigned int size, i;
+	struct rte_stack *s;
+
+	s = args->s;
+	size = args->sz;
+
+	rte_atomic32_sub(&lcore_barrier, 1);
+	while (rte_atomic32_read(&lcore_barrier) != 0)
+		rte_pause();
+
+	uint64_t start = rte_rdtsc();
+
+	for (i = 0; i < iterations; i++) {
+		rte_stack_push(s, objs, size);
+		rte_stack_pop(s, objs, size);
+	}
+
+	uint64_t end = rte_rdtsc();
+
+	args->avg = ((double)(end - start))/(iterations * size);
+
+	return 0;
+}
+
+/*
+ * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
+ * perf between hyperthread siblings, cores on the same socket, and cores
+ * on different sockets.
+ */
+static void
+run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
+		 lcore_function_t fn)
+{
+	struct thread_args args[2];
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		rte_atomic32_set(&lcore_barrier, 2);
+
+		args[0].sz = args[1].sz = bulk_sizes[i];
+		args[0].s = args[1].s = s;
+
+		if (cores->c1 == rte_get_master_lcore()) {
+			rte_eal_remote_launch(fn, &args[1], cores->c2);
+			fn(&args[0]);
+			rte_eal_wait_lcore(cores->c2);
+		} else {
+			rte_eal_remote_launch(fn, &args[0], cores->c1);
+			rte_eal_remote_launch(fn, &args[1], cores->c2);
+			rte_eal_wait_lcore(cores->c1);
+			rte_eal_wait_lcore(cores->c2);
+		}
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
+	}
+}
+
+/* Run bulk_push_pop() simultaneously on 1+ cores. */
+static void
+run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
+{
+	struct thread_args args[RTE_MAX_LCORE];
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		unsigned int lcore_id;
+		int cnt = 0;
+		double avg;
+
+		rte_atomic32_set(&lcore_barrier, n);
+
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+			if (++cnt >= n)
+				break;
+
+			args[lcore_id].s = s;
+			args[lcore_id].sz = bulk_sizes[i];
+
+			if (rte_eal_remote_launch(fn, &args[lcore_id],
+						  lcore_id))
+				rte_panic("Failed to launch lcore %d\n",
+					  lcore_id);
+		}
+
+		lcore_id = rte_lcore_id();
+
+		args[lcore_id].s = s;
+		args[lcore_id].sz = bulk_sizes[i];
+
+		fn(&args[lcore_id]);
+
+		rte_eal_mp_wait_lcore();
+
+		avg = args[rte_lcore_id()].avg;
+
+		cnt = 0;
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+			if (++cnt >= n)
+				break;
+			avg += args[lcore_id].avg;
+		}
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[i], avg / n);
+	}
+}
+
+/*
+ * Measure the cycle cost of pushing and popping a single pointer on a single
+ * lcore.
+ */
+static void
+test_single_push_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 16000000;
+	void *obj = NULL;
+	unsigned int i;
+
+	uint64_t start = rte_rdtsc();
+
+	for (i = 0; i < iterations; i++) {
+		rte_stack_push(s, &obj, 1);
+		rte_stack_pop(s, &obj, 1);
+	}
+
+	uint64_t end = rte_rdtsc();
+
+	printf("Average cycles per single object push/pop: %.2F\n",
+	       ((double)(end - start)) / iterations);
+}
+
+/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
+static void
+test_bulk_push_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 8000000;
+	void *objs[MAX_BURST];
+	unsigned int sz, i;
+
+	for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
+		uint64_t start = rte_rdtsc();
+
+		for (i = 0; i < iterations; i++) {
+			rte_stack_push(s, objs, bulk_sizes[sz]);
+			rte_stack_pop(s, objs, bulk_sizes[sz]);
+		}
+
+		uint64_t end = rte_rdtsc();
+
+		double avg = ((double)(end - start) /
+			      (iterations * bulk_sizes[sz]));
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[sz], avg);
+	}
+}
+
+static int
+test_stack_perf(void)
+{
+	struct lcore_pair cores;
+	struct rte_stack *s;
+
+	rte_atomic32_init(&lcore_barrier);
+
+	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), 0);
+	if (s == NULL) {
+		printf("[%s():%u] failed to create a stack\n",
+		       __func__, __LINE__);
+		return -1;
+	}
+
+	printf("### Testing single element push/pop ###\n");
+	test_single_push_pop(s);
+
+	printf("\n### Testing empty pop ###\n");
+	test_empty_pop(s);
+
+	printf("\n### Testing using a single lcore ###\n");
+	test_bulk_push_pop(s);
+
+	if (get_two_hyperthreads(&cores) == 0) {
+		printf("\n### Testing using two hyperthreads ###\n");
+		run_on_core_pair(&cores, s, bulk_push_pop);
+	}
+	if (get_two_cores(&cores) == 0) {
+		printf("\n### Testing using two physical cores ###\n");
+		run_on_core_pair(&cores, s, bulk_push_pop);
+	}
+	if (get_two_sockets(&cores) == 0) {
+		printf("\n### Testing using two NUMA nodes ###\n");
+		run_on_core_pair(&cores, s, bulk_push_pop);
+	}
+
+	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
+	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
+
+	rte_stack_free(s);
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
-- 
2.13.6

From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <dev-bounces@dpdk.org>
Received: from dpdk.org (dpdk.org [92.243.14.124])
	by dpdk.space (Postfix) with ESMTP id 0EF5AA0679
	for <public@inbox.dpdk.org>; Mon,  1 Apr 2019 23:15:48 +0200 (CEST)
Received: from [92.243.14.124] (localhost [127.0.0.1])
	by dpdk.org (Postfix) with ESMTP id 8E5F958C4;
	Mon,  1 Apr 2019 23:15:21 +0200 (CEST)
Received: from mga04.intel.com (mga04.intel.com [192.55.52.120])
 by dpdk.org (Postfix) with ESMTP id CBE014CA9
 for <dev@dpdk.org>; Mon,  1 Apr 2019 23:15:13 +0200 (CEST)
X-Amp-Result: SKIPPED(no attachment in message)
X-Amp-File-Uploaded: False
Received: from orsmga005.jf.intel.com ([10.7.209.41])
 by fmsmga104.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384;
 01 Apr 2019 14:15:13 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.60,298,1549958400"; d="scan'208";a="312277350"
Received: from txasoft-yocto.an.intel.com ([10.123.72.192])
 by orsmga005.jf.intel.com with ESMTP; 01 Apr 2019 14:15:12 -0700
From: Gage Eads <gage.eads@intel.com>
To: dev@dpdk.org
Cc: olivier.matz@6wind.com, arybchenko@solarflare.com,
 bruce.richardson@intel.com, konstantin.ananyev@intel.com, gavin.hu@arm.com,
 Honnappa.Nagarahalli@arm.com, nd@arm.com, thomas@monjalon.net
Date: Mon,  1 Apr 2019 16:14:25 -0500
Message-Id: <20190401211429.20282-5-gage.eads@intel.com>
X-Mailer: git-send-email 2.13.6
In-Reply-To: <20190401211429.20282-1-gage.eads@intel.com>
References: <20190401001238.17625-1-gage.eads@intel.com>
 <20190401211429.20282-1-gage.eads@intel.com>
Subject: [dpdk-dev] [PATCH v6 4/8] test/stack: add stack perf test
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: DPDK patches and discussions <dev.dpdk.org>
List-Unsubscribe: <https://mails.dpdk.org/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://mails.dpdk.org/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <https://mails.dpdk.org/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
Errors-To: dev-bounces@dpdk.org
Sender: "dev" <dev-bounces@dpdk.org>
Content-Type: text/plain; charset="UTF-8"
Message-ID: <20190401211425.EB6-IQqLgqFrBOEnDEgx_rYn70HAqWCMwTjBEjlJ3lI@z>

stack_perf_autotest tests the following with one lcore:
- Cycles to attempt to pop an empty stack
- Cycles to push then pop a single object
- Cycles to push then pop a burst of 32 objects

It also tests the cycles to push then pop a burst of 8 and 32 objects with
the following lcore combinations (if possible):
- Two hyperthreads
- Two physical cores
- Two physical cores on separate NUMA nodes
- All available lcores

Signed-off-by: Gage Eads <gage.eads@intel.com>
Reviewed-by: Olivier Matz <olivier.matz@6wind.com>
---
 app/test/Makefile          |   1 +
 app/test/meson.build       |   2 +
 app/test/test_stack_perf.c | 343 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 346 insertions(+)
 create mode 100644 app/test/test_stack_perf.c

diff --git a/app/test/Makefile b/app/test/Makefile
index e5bde81af..b28bed2d4 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -91,6 +91,7 @@ endif
 SRCS-y += test_rwlock.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack.c
+SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack_perf.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer.c
 SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_perf.c
diff --git a/app/test/meson.build b/app/test/meson.build
index 56ea13f53..02eb788a4 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -96,6 +96,7 @@ test_sources = files('commands.c',
 	'test_service_cores.c',
 	'test_spinlock.c',
 	'test_stack.c',
+	'test_stack_perf.c',
 	'test_string_fns.c',
 	'test_table.c',
 	'test_table_acl.c',
@@ -241,6 +242,7 @@ perf_test_names = [
         'distributor_perf_autotest',
         'ring_pmd_perf_autotest',
         'pmd_perf_autotest',
+        'stack_perf_autotest',
 ]
 
 # All test cases in driver_test_names list are non-parallel
diff --git a/app/test/test_stack_perf.c b/app/test/test_stack_perf.c
new file mode 100644
index 000000000..484370d30
--- /dev/null
+++ b/app/test/test_stack_perf.c
@@ -0,0 +1,343 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <rte_stack.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+
+#include "test.h"
+
+#define STACK_NAME "STACK_PERF"
+#define MAX_BURST 32
+#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
+ * constants.
+ */
+static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};
+
+static rte_atomic32_t lcore_barrier;
+
+struct lcore_pair {
+	unsigned int c1;
+	unsigned int c2;
+};
+
+static int
+get_two_hyperthreads(struct lcore_pair *lcp)
+{
+	unsigned int socket[2];
+	unsigned int core[2];
+	unsigned int id[2];
+
+	RTE_LCORE_FOREACH(id[0]) {
+		RTE_LCORE_FOREACH(id[1]) {
+			if (id[0] == id[1])
+				continue;
+			core[0] = lcore_config[id[0]].core_id;
+			core[1] = lcore_config[id[1]].core_id;
+			socket[0] = lcore_config[id[0]].socket_id;
+			socket[1] = lcore_config[id[1]].socket_id;
+			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
+				lcp->c1 = id[0];
+				lcp->c2 = id[1];
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+static int
+get_two_cores(struct lcore_pair *lcp)
+{
+	unsigned int socket[2];
+	unsigned int core[2];
+	unsigned int id[2];
+
+	RTE_LCORE_FOREACH(id[0]) {
+		RTE_LCORE_FOREACH(id[1]) {
+			if (id[0] == id[1])
+				continue;
+			core[0] = lcore_config[id[0]].core_id;
+			core[1] = lcore_config[id[1]].core_id;
+			socket[0] = lcore_config[id[0]].socket_id;
+			socket[1] = lcore_config[id[1]].socket_id;
+			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
+				lcp->c1 = id[0];
+				lcp->c2 = id[1];
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+static int
+get_two_sockets(struct lcore_pair *lcp)
+{
+	unsigned int socket[2];
+	unsigned int id[2];
+
+	RTE_LCORE_FOREACH(id[0]) {
+		RTE_LCORE_FOREACH(id[1]) {
+			if (id[0] == id[1])
+				continue;
+			socket[0] = lcore_config[id[0]].socket_id;
+			socket[1] = lcore_config[id[1]].socket_id;
+			if (socket[0] != socket[1]) {
+				lcp->c1 = id[0];
+				lcp->c2 = id[1];
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+/* Measure the cycle cost of popping an empty stack. */
+static void
+test_empty_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 100000000;
+	void *objs[MAX_BURST];
+	unsigned int i;
+
+	uint64_t start = rte_rdtsc();
+
+	for (i = 0; i < iterations; i++)
+		rte_stack_pop(s, objs, bulk_sizes[0]);
+
+	uint64_t end = rte_rdtsc();
+
+	printf("Stack empty pop: %.2F\n",
+	       (double)(end - start) / iterations);
+}
+
+struct thread_args {
+	struct rte_stack *s;
+	unsigned int sz;
+	double avg;
+};
+
+/* Measure the average per-pointer cycle cost of stack push and pop */
+static int
+bulk_push_pop(void *p)
+{
+	unsigned int iterations = 1000000;
+	struct thread_args *args = p;
+	void *objs[MAX_BURST] = {0};
+	unsigned int size, i;
+	struct rte_stack *s;
+
+	s = args->s;
+	size = args->sz;
+
+	rte_atomic32_sub(&lcore_barrier, 1);
+	while (rte_atomic32_read(&lcore_barrier) != 0)
+		rte_pause();
+
+	uint64_t start = rte_rdtsc();
+
+	for (i = 0; i < iterations; i++) {
+		rte_stack_push(s, objs, size);
+		rte_stack_pop(s, objs, size);
+	}
+
+	uint64_t end = rte_rdtsc();
+
+	args->avg = ((double)(end - start))/(iterations * size);
+
+	return 0;
+}
+
+/*
+ * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
+ * perf between hyperthread siblings, cores on the same socket, and cores
+ * on different sockets.
+ */
+static void
+run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
+		 lcore_function_t fn)
+{
+	struct thread_args args[2];
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		rte_atomic32_set(&lcore_barrier, 2);
+
+		args[0].sz = args[1].sz = bulk_sizes[i];
+		args[0].s = args[1].s = s;
+
+		if (cores->c1 == rte_get_master_lcore()) {
+			rte_eal_remote_launch(fn, &args[1], cores->c2);
+			fn(&args[0]);
+			rte_eal_wait_lcore(cores->c2);
+		} else {
+			rte_eal_remote_launch(fn, &args[0], cores->c1);
+			rte_eal_remote_launch(fn, &args[1], cores->c2);
+			rte_eal_wait_lcore(cores->c1);
+			rte_eal_wait_lcore(cores->c2);
+		}
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
+	}
+}
+
+/* Run bulk_push_pop() simultaneously on 1+ cores. */
+static void
+run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
+{
+	struct thread_args args[RTE_MAX_LCORE];
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		unsigned int lcore_id;
+		int cnt = 0;
+		double avg;
+
+		rte_atomic32_set(&lcore_barrier, n);
+
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+			if (++cnt >= n)
+				break;
+
+			args[lcore_id].s = s;
+			args[lcore_id].sz = bulk_sizes[i];
+
+			if (rte_eal_remote_launch(fn, &args[lcore_id],
+						  lcore_id))
+				rte_panic("Failed to launch lcore %d\n",
+					  lcore_id);
+		}
+
+		lcore_id = rte_lcore_id();
+
+		args[lcore_id].s = s;
+		args[lcore_id].sz = bulk_sizes[i];
+
+		fn(&args[lcore_id]);
+
+		rte_eal_mp_wait_lcore();
+
+		avg = args[rte_lcore_id()].avg;
+
+		cnt = 0;
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+			if (++cnt >= n)
+				break;
+			avg += args[lcore_id].avg;
+		}
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[i], avg / n);
+	}
+}
+
+/*
+ * Measure the cycle cost of pushing and popping a single pointer on a single
+ * lcore.
+ */
+static void
+test_single_push_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 16000000;
+	void *obj = NULL;
+	unsigned int i;
+
+	uint64_t start = rte_rdtsc();
+
+	for (i = 0; i < iterations; i++) {
+		rte_stack_push(s, &obj, 1);
+		rte_stack_pop(s, &obj, 1);
+	}
+
+	uint64_t end = rte_rdtsc();
+
+	printf("Average cycles per single object push/pop: %.2F\n",
+	       ((double)(end - start)) / iterations);
+}
+
+/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
+static void
+test_bulk_push_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 8000000;
+	void *objs[MAX_BURST];
+	unsigned int sz, i;
+
+	for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
+		uint64_t start = rte_rdtsc();
+
+		for (i = 0; i < iterations; i++) {
+			rte_stack_push(s, objs, bulk_sizes[sz]);
+			rte_stack_pop(s, objs, bulk_sizes[sz]);
+		}
+
+		uint64_t end = rte_rdtsc();
+
+		double avg = ((double)(end - start) /
+			      (iterations * bulk_sizes[sz]));
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[sz], avg);
+	}
+}
+
+static int
+test_stack_perf(void)
+{
+	struct lcore_pair cores;
+	struct rte_stack *s;
+
+	rte_atomic32_init(&lcore_barrier);
+
+	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), 0);
+	if (s == NULL) {
+		printf("[%s():%u] failed to create a stack\n",
+		       __func__, __LINE__);
+		return -1;
+	}
+
+	printf("### Testing single element push/pop ###\n");
+	test_single_push_pop(s);
+
+	printf("\n### Testing empty pop ###\n");
+	test_empty_pop(s);
+
+	printf("\n### Testing using a single lcore ###\n");
+	test_bulk_push_pop(s);
+
+	if (get_two_hyperthreads(&cores) == 0) {
+		printf("\n### Testing using two hyperthreads ###\n");
+		run_on_core_pair(&cores, s, bulk_push_pop);
+	}
+	if (get_two_cores(&cores) == 0) {
+		printf("\n### Testing using two physical cores ###\n");
+		run_on_core_pair(&cores, s, bulk_push_pop);
+	}
+	if (get_two_sockets(&cores) == 0) {
+		printf("\n### Testing using two NUMA nodes ###\n");
+		run_on_core_pair(&cores, s, bulk_push_pop);
+	}
+
+	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
+	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
+
+	rte_stack_free(s);
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
-- 
2.13.6