DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support
@ 2020-12-16 16:49 Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

This patchset enhances the regex application to support multi Q with multi cores.

v1: initial release

Ophir Munk (6):
  app/regex: move mem pool creation to worker routine
  app/regex: support multi QPs
  app/regex: read data file once at startup
  app/regex: support multi cores
  app/regex: support performance measurements per QP
  app/regex: replace Linux clock() API with rdtsc

 app/test-regex/main.c | 519 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 373 insertions(+), 146 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v1 1/6] app/regex: move mem pool creation to worker routine
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
@ 2020-12-16 16:49 ` Ophir Munk
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 2/6] app/regex: support multi QPs Ophir Munk
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

Function rte_pktmbuf_pool_create() is moved from init_port() routine to
run_regex() routine. Looking ahead to multi-core support - init_port()
will be called only once as part of application startup while mem pool
creation should be called multiple times (per core).

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index ac6152d..cb2a065 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -163,8 +163,7 @@ read_file(char *file, char **buf)
 }
 
 static int
-init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
-	  uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
+init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 {
 	uint16_t id;
 	uint16_t num_devs;
@@ -187,14 +186,6 @@ init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
 		return -EINVAL;
 	}
 
-	*mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
-					  0, MBUF_SIZE, rte_socket_id());
-	if (*mbuf_mp == NULL) {
-		printf("Error, can't create memory pool\n");
-		res = -ENOMEM;
-		goto error;
-	}
-
 	rules_len = read_file(rules_file, &rules);
 	if (rules_len < 0) {
 		printf("Error, can't read rules files.\n");
@@ -237,8 +228,6 @@ init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
 error:
 	if (rules)
 		rte_free(rules);
-	if (*mbuf_mp)
-		rte_mempool_free(*mbuf_mp);
 	return res;
 }
 
@@ -248,7 +237,7 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 }
 
 static int
-run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
+run_regex(uint32_t nb_jobs,
 	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
 	  char *data_file, uint8_t nb_max_matches)
 {
@@ -273,9 +262,17 @@ run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
 	time_t end;
 	double time;
 	struct job_ctx *jobs_ctx;
+	struct rte_mempool *mbuf_mp;
 
 	shinfo.free_cb = extbuf_free_cb;
 
+	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
+			0, MBUF_SIZE, rte_socket_id());
+	if (mbuf_mp == NULL) {
+		printf("Error, can't create memory pool\n");
+		return -ENOMEM;
+	}
+
 	ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
 	if (!ops) {
 		printf("Error, can't allocate memory for ops.\n");
@@ -409,6 +406,9 @@ run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
 	rte_free(jobs_ctx);
 	if (buf)
 		rte_free(buf);
+	if (mbuf_mp)
+		rte_mempool_free(mbuf_mp);
+
 	return res;
 }
 
@@ -417,7 +417,6 @@ main(int argc, char **argv)
 {
 	char rules_file[MAX_FILE_NAME];
 	char data_file[MAX_FILE_NAME];
-	struct rte_mempool *mbuf_mp = NULL;
 	uint32_t nb_jobs = 0;
 	uint16_t nb_max_payload = 0;
 	bool perf_mode = 0;
@@ -434,16 +433,13 @@ main(int argc, char **argv)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
 			   &perf_mode, &nb_iterations);
 
-	ret = init_port(&mbuf_mp, nb_jobs, &nb_max_payload, rules_file,
-			&nb_max_matches);
+	ret = init_port(&nb_max_payload, rules_file, &nb_max_matches);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
-	ret = run_regex(mbuf_mp, nb_jobs, nb_max_payload, perf_mode,
+	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
 			nb_iterations, data_file, nb_max_matches);
 	if (ret < 0) {
-		rte_mempool_free(mbuf_mp);
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
-	rte_mempool_free(mbuf_mp);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v1 2/6] app/regex: support multi QPs
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
@ 2020-12-16 16:49 ` Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 3/6] app/regex: read data file once at startup Ophir Munk
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

Up to this commit the regex application used one QP which was assigned a
number of jobs, each with a different segment of a file to parse.  This
commit adds support for multiple QP assignments. All QPs will be
assigned the same number of jobs, with the same segments of file to
parse. It will enable comparing functionality with different numbers of
QPs. All queues are managed on one core with one thread. This commit
focuses on changing routines API to support multi QPs, mainly, QP scalar
variables are replaced by per-QP struct instances. The enqueue/dequeue
operations are interleaved as follows:
 enqueue(QP #1)
 enqueue(QP #2)
 ...
 enqueue(QP #n)
 dequeue(QP #1)
 dequeue(QP #2)
 ...
 dequeue(QP #n)

A new parameter 'nb_qps' was added to configure the number of QPs:
 --nb_qps <num of qps>.
If not configured, nb_qps is set to 1 by default.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 322 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 204 insertions(+), 118 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index cb2a065..d225267 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -33,12 +33,22 @@ enum app_args {
 	ARG_NUM_OF_JOBS,
 	ARG_PERF_MODE,
 	ARG_NUM_OF_ITERATIONS,
+	ARG_NUM_OF_QPS,
 };
 
 struct job_ctx {
 	struct rte_mbuf *mbuf;
 };
 
+struct qp_params {
+	uint32_t total_enqueue;
+	uint32_t total_dequeue;
+	uint32_t total_matches;
+	struct rte_regex_ops **ops;
+	struct job_ctx *jobs_ctx;
+	char *buf;
+};
+
 static void
 usage(const char *prog_name)
 {
@@ -47,13 +57,15 @@ usage(const char *prog_name)
 		" --data NAME: data file to use\n"
 		" --nb_jobs: number of jobs to use\n"
 		" --perf N: only outputs the performance data\n"
-		" --nb_iter N: number of iteration to run\n",
+		" --nb_iter N: number of iteration to run\n"
+		" --nb_qps N: number of queues to use\n",
 		prog_name);
 }
 
 static void
 args_parse(int argc, char **argv, char *rules_file, char *data_file,
-	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations)
+	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations,
+	   uint32_t *nb_qps)
 {
 	char **argvopt;
 	int opt;
@@ -71,6 +83,8 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		{ "perf", 0, 0, ARG_PERF_MODE},
 		/* Number of iterations to run with perf test */
 		{ "nb_iter", 1, 0, ARG_NUM_OF_ITERATIONS},
+		/* Number of QPs. */
+		{ "nb_qps", 1, 0, ARG_NUM_OF_QPS},
 		/* End of options */
 		{ 0, 0, 0, 0 }
 	};
@@ -104,6 +118,9 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		case ARG_NUM_OF_ITERATIONS:
 			*nb_iterations = atoi(optarg);
 			break;
+		case ARG_NUM_OF_QPS:
+			*nb_qps = atoi(optarg);
+			break;
 		case ARG_HELP:
 			usage("RegEx test app");
 			break;
@@ -163,15 +180,17 @@ read_file(char *file, char **buf)
 }
 
 static int
-init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
+init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches,
+	  uint32_t nb_qps)
 {
 	uint16_t id;
+	uint16_t qp_id;
 	uint16_t num_devs;
 	char *rules = NULL;
 	long rules_len;
 	struct rte_regexdev_info info;
 	struct rte_regexdev_config dev_conf = {
-		.nb_queue_pairs = 1,
+		.nb_queue_pairs = nb_qps,
 		.nb_groups = 1,
 	};
 	struct rte_regexdev_qp_conf qp_conf = {
@@ -203,7 +222,8 @@ init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 		*nb_max_matches = info.max_matches;
 		*nb_max_payload = info.max_payload_size;
 		if (info.regexdev_capa & RTE_REGEXDEV_SUPP_MATCH_AS_END_F)
-			dev_conf.dev_cfg_flags |= RTE_REGEXDEV_CFG_MATCH_AS_END_F;
+			dev_conf.dev_cfg_flags |=
+			RTE_REGEXDEV_CFG_MATCH_AS_END_F;
 		dev_conf.nb_max_matches = info.max_matches;
 		dev_conf.nb_rules_per_group = info.max_rules_per_group;
 		dev_conf.rule_db_len = rules_len;
@@ -214,12 +234,16 @@ init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 			goto error;
 		}
 		if (info.regexdev_capa & RTE_REGEXDEV_CAPA_QUEUE_PAIR_OOS_F)
-			qp_conf.qp_conf_flags |= RTE_REGEX_QUEUE_PAIR_CFG_OOS_F;
-		res = rte_regexdev_queue_pair_setup(id, 0, &qp_conf);
-		if (res < 0) {
-			printf("Error, can't setup queue pair for device %d.\n",
-			       id);
-			goto error;
+			qp_conf.qp_conf_flags |=
+			RTE_REGEX_QUEUE_PAIR_CFG_OOS_F;
+		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+			res = rte_regexdev_queue_pair_setup(id, qp_id,
+							    &qp_conf);
+			if (res < 0) {
+				printf("Error, can't setup queue pair %u for "
+				       "device %d.\n", qp_id, id);
+				goto error;
+			}
 		}
 		printf(":: initializing device: %d done\n", id);
 	}
@@ -239,122 +263,171 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 static int
 run_regex(uint32_t nb_jobs,
 	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
-	  char *data_file, uint8_t nb_max_matches)
+	  char *data_file, uint8_t nb_max_matches, uint32_t nb_qps)
 {
 	char *buf = NULL;
-	long buf_len;
-	long job_len;
+	long buf_len = 0;
+	long job_len = 0;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
-	struct rte_regex_ops **ops;
+	uint16_t qp_id;
 	uint16_t dev_id = 0;
-	uint16_t qp_id = 0;
 	uint8_t nb_matches;
 	struct rte_regexdev_match *match;
-	long pos = 0;
+	long pos;
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
-	uint32_t total_enqueue = 0;
-	uint32_t total_dequeue = 0;
-	uint32_t total_matches = 0;
 	int res = 0;
 	time_t start;
 	time_t end;
 	double time;
-	struct job_ctx *jobs_ctx;
 	struct rte_mempool *mbuf_mp;
+	struct qp_params *qp;
+	struct qp_params *qps = NULL;
+	bool update;
+	uint16_t qps_used = 0;
 
 	shinfo.free_cb = extbuf_free_cb;
-
-	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
+	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs * nb_qps, 0,
 			0, MBUF_SIZE, rte_socket_id());
 	if (mbuf_mp == NULL) {
 		printf("Error, can't create memory pool\n");
 		return -ENOMEM;
 	}
 
-	ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
-	if (!ops) {
-		printf("Error, can't allocate memory for ops.\n");
-		return -ENOMEM;
-	}
-
-	jobs_ctx = rte_malloc(NULL, sizeof(struct job_ctx)*nb_jobs, 0);
-	if (!jobs_ctx) {
-		printf("Error, can't allocate memory for jobs_ctx.\n");
-		return -ENOMEM;
+	qps = rte_malloc(NULL, sizeof(*qps) * nb_qps, 0);
+	if (!qps) {
+		printf("Error, can't allocate memory for QPs\n");
+		res = -ENOMEM;
+		goto end;
 	}
 
-	/* Allocate the jobs and assign each job with an mbuf. */
-	for (i = 0; i < nb_jobs; i++) {
-		ops[i] = rte_malloc(NULL, sizeof(*ops[0]) + nb_max_matches *
-				    sizeof(struct rte_regexdev_match), 0);
-		if (!ops[i]) {
-			printf("Error, can't allocate memory for op.\n");
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		struct rte_regex_ops **ops;
+		struct job_ctx *jobs_ctx;
+
+		qps_used++;
+		qp = &qps[qp_id];
+		qp->jobs_ctx = NULL;
+		qp->buf = NULL;
+		qp->ops = ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
+		if (!ops) {
+			printf("Error, can't allocate memory for ops.\n");
 			res = -ENOMEM;
 			goto end;
 		}
-		ops[i]->mbuf = rte_pktmbuf_alloc(mbuf_mp);
-		if (!ops[i]->mbuf) {
-			printf("Error, can't attach mbuf.\n");
+
+		qp->jobs_ctx = jobs_ctx =
+			rte_malloc(NULL, sizeof(*jobs_ctx) * nb_jobs, 0);
+		if (!jobs_ctx) {
+			printf("Error, can't allocate memory for jobs_ctx.\n");
 			res = -ENOMEM;
 			goto end;
 		}
-	}
 
-	buf_len = read_file(data_file, &buf);
-	if (buf_len <= 0) {
-		printf("Error, can't read file, or file is empty.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		/* Allocate the jobs and assign each job with an mbuf. */
+		for (i = 0; i < nb_jobs; i++) {
+			ops[i] = rte_malloc(NULL, sizeof(*ops[0]) +
+					nb_max_matches *
+					sizeof(struct rte_regexdev_match), 0);
+			if (!ops[i]) {
+				printf("Error, can't allocate "
+				       "memory for op.\n");
+				res = -ENOMEM;
+				goto end;
+			}
+			ops[i]->mbuf = rte_pktmbuf_alloc(mbuf_mp);
+			if (!ops[i]->mbuf) {
+				printf("Error, can't attach mbuf.\n");
+				res = -ENOMEM;
+				goto end;
+			}
+		}
 
-	job_len = buf_len / nb_jobs;
-	if (job_len == 0) {
-		printf("Error, To many jobs, for the given input.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		buf_len = read_file(data_file, &buf);
+		if (buf_len <= 0) {
+			printf("Error, can't read file, or file is empty.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
 
-	if (job_len > nb_max_payload) {
-		printf("Error, not enough jobs to cover input.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		job_len = buf_len / nb_jobs;
+		if (job_len == 0) {
+			printf("Error, To many jobs, for the given input.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
+
+		if (job_len > nb_max_payload) {
+			printf("Error, not enough jobs to cover input.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
 
-	/* Assign each mbuf with the data to handle. */
-	for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
-		long act_job_len = RTE_MIN(job_len, buf_len - pos);
-		rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
-					  act_job_len, &shinfo);
-		jobs_ctx[i].mbuf = ops[i]->mbuf;
-		ops[i]->mbuf->data_len = job_len;
-		ops[i]->mbuf->pkt_len = act_job_len;
-		ops[i]->user_id = i;
-		ops[i]->group_id0 = 1;
-		pos += act_job_len;
-		actual_jobs++;
+		/* Assign each mbuf with the data to handle. */
+		actual_jobs = 0;
+		pos = 0;
+		for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
+			long act_job_len = RTE_MIN(job_len, buf_len - pos);
+			rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
+					act_job_len, &shinfo);
+			jobs_ctx[i].mbuf = ops[i]->mbuf;
+			ops[i]->mbuf->data_len = job_len;
+			ops[i]->mbuf->pkt_len = act_job_len;
+			ops[i]->user_id = i;
+			ops[i]->group_id0 = 1;
+			pos += act_job_len;
+			actual_jobs++;
+		}
+
+		qp->buf = buf;
+		qp->total_matches = 0;
 	}
 
 	start = clock();
 	for (i = 0; i < nb_iterations; i++) {
-		total_enqueue = 0;
-		total_dequeue = 0;
-		while (total_dequeue < actual_jobs) {
-			struct rte_regex_ops **cur_ops_to_enqueue = ops +
-				total_enqueue;
-			struct rte_regex_ops **cur_ops_to_dequeue = ops +
-				total_dequeue;
-
-			if (actual_jobs - total_enqueue)
-				total_enqueue += rte_regexdev_enqueue_burst
-					(dev_id, qp_id, cur_ops_to_enqueue,
-					 actual_jobs - total_enqueue);
-
-			total_dequeue += rte_regexdev_dequeue_burst
-				(dev_id, qp_id, cur_ops_to_dequeue,
-				 total_enqueue - total_dequeue);
+		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+			qp = &qps[qp_id];
+			qp->total_enqueue = 0;
+			qp->total_dequeue = 0;
 		}
+		do {
+			update = false;
+			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+				qp = &qps[qp_id];
+				if (qp->total_dequeue < actual_jobs) {
+					struct rte_regex_ops **
+						cur_ops_to_enqueue = qp->ops +
+						qp->total_enqueue;
+
+					if (actual_jobs - qp->total_enqueue)
+						qp->total_enqueue +=
+						rte_regexdev_enqueue_burst
+							(dev_id,
+							qp_id,
+							cur_ops_to_enqueue,
+							actual_jobs -
+							qp->total_enqueue);
+				}
+			}
+			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+				qp = &qps[qp_id];
+				if (qp->total_dequeue < actual_jobs) {
+					struct rte_regex_ops **
+						cur_ops_to_dequeue = qp->ops +
+						qp->total_dequeue;
+
+					qp->total_dequeue +=
+						rte_regexdev_dequeue_burst
+							(dev_id,
+							qp_id,
+							cur_ops_to_dequeue,
+							qp->total_enqueue -
+							qp->total_dequeue);
+					update = true;
+				}
+			}
+		} while (update);
 	}
 	end = clock();
 	time = ((double)end - start) / CLOCKS_PER_SEC;
@@ -364,51 +437,59 @@ run_regex(uint32_t nb_jobs,
 	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
 		1000000000.0);
 
-	if (!perf_mode) {
+	if (perf_mode)
+		goto end;
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		printf("\n############ QP id=%u ############\n", qp_id);
+		qp = &qps[qp_id];
 		/* Log results per job. */
-		for (d_ind = 0; d_ind < total_dequeue; d_ind++) {
-			nb_matches = ops[d_ind % actual_jobs]->nb_matches;
+		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
+			nb_matches = qp->ops[d_ind % actual_jobs]->nb_matches;
 			printf("Job id %"PRIu64" number of matches = %d\n",
-			       ops[d_ind]->user_id, nb_matches);
-			total_matches += nb_matches;
-			match = ops[d_ind % actual_jobs]->matches;
+					qp->ops[d_ind]->user_id, nb_matches);
+			qp->total_matches += nb_matches;
+			match = qp->ops[d_ind % actual_jobs]->matches;
 			for (i = 0; i < nb_matches; i++) {
-				printf("match %d, rule = %d, start = %d,len = %d\n",
+				printf("match %d, rule = %d, "
+				       "start = %d,len = %d\n",
 				       i, match->rule_id, match->start_offset,
 				       match->len);
 				match++;
 			}
 		}
-		printf("Total matches = %d\n", total_matches);
+		printf("Total matches = %d\n", qp->total_matches);
 		printf("All Matches:\n");
-
 		/* Log absolute results. */
-		for (d_ind = 0; d_ind < total_dequeue; d_ind++) {
-			nb_matches = ops[d_ind % actual_jobs]->nb_matches;
-			total_matches += nb_matches;
-			match = ops[d_ind % actual_jobs]->matches;
+		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
+			nb_matches = qp->ops[d_ind % actual_jobs]->nb_matches;
+			qp->total_matches += nb_matches;
+			match = qp->ops[d_ind % actual_jobs]->matches;
 			for (i = 0; i < nb_matches; i++) {
 				printf("start = %ld, len = %d, rule = %d\n",
-				       match->start_offset + d_ind * job_len,
-				       match->len, match->rule_id);
+						match->start_offset +
+						d_ind * job_len,
+						match->len, match->rule_id);
 				match++;
 			}
 		}
 	}
 end:
-	for (i = 0; i < actual_jobs; i++) {
-		if (ops[i])
-			rte_free(ops[i]);
-		if (jobs_ctx[i].mbuf)
-			rte_pktmbuf_free(jobs_ctx[i].mbuf);
+	for (qp_id = 0; qp_id < qps_used; qp_id++) {
+		qp = &qps[qp_id];
+		for (i = 0; i < actual_jobs && qp->ops; i++)
+			rte_free(qp->ops[i]);
+		rte_free(qp->ops);
+		qp->ops = NULL;
+		for (i = 0; i < actual_jobs && qp->jobs_ctx; i++)
+			rte_pktmbuf_free(qp->jobs_ctx[i].mbuf);
+		rte_free(qp->jobs_ctx);
+		qp->jobs_ctx = NULL;
+		rte_free(qp->buf);
+		qp->buf = NULL;
 	}
-	rte_free(ops);
-	rte_free(jobs_ctx);
-	if (buf)
-		rte_free(buf);
 	if (mbuf_mp)
 		rte_mempool_free(mbuf_mp);
-
+	rte_free(qps);
 	return res;
 }
 
@@ -418,12 +499,14 @@ main(int argc, char **argv)
 	char rules_file[MAX_FILE_NAME];
 	char data_file[MAX_FILE_NAME];
 	uint32_t nb_jobs = 0;
-	uint16_t nb_max_payload = 0;
 	bool perf_mode = 0;
 	uint32_t nb_iterations = 0;
-	uint8_t nb_max_matches = 0;
 	int ret;
+	uint16_t nb_max_payload = 0;
+	uint8_t nb_max_matches = 0;
+	uint32_t nb_qps = 1;
 
+	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "EAL init failed\n");
@@ -431,13 +514,16 @@ main(int argc, char **argv)
 	argv += ret;
 	if (argc > 1)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
-			   &perf_mode, &nb_iterations);
+				&perf_mode, &nb_iterations, &nb_qps);
 
-	ret = init_port(&nb_max_payload, rules_file, &nb_max_matches);
+	if (nb_qps == 0)
+		rte_exit(EXIT_FAILURE, "Number of QPs must be greater than 0\n");
+	ret = init_port(&nb_max_payload, rules_file,
+			&nb_max_matches, nb_qps);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
 	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
-			nb_iterations, data_file, nb_max_matches);
+			nb_iterations, data_file, nb_max_matches, nb_qps);
 	if (ret < 0) {
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v1 3/6] app/regex: read data file once at startup
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 2/6] app/regex: support multi QPs Ophir Munk
@ 2020-12-16 16:49 ` Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 4/6] app/regex: support multi cores Ophir Munk
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

Up to this commit the input data file was read from scratch for each QP,
which is redundant. Starting from this commit the data file is read only
once at startup. Each QP will clone the data.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 63 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index d225267..9bafd02 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -180,6 +180,19 @@ read_file(char *file, char **buf)
 }
 
 static int
+clone_buf(char *data_buf, char **buf, long data_len)
+{
+	char *dest_buf;
+	dest_buf =
+		rte_malloc(NULL, sizeof(char) * (data_len + 1), 4096);
+	if (!dest_buf)
+		return -ENOMEM;
+	memcpy(dest_buf, data_buf, data_len + 1);
+	*buf = dest_buf;
+	return 0;
+}
+
+static int
 init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches,
 	  uint32_t nb_qps)
 {
@@ -262,12 +275,11 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 
 static int
 run_regex(uint32_t nb_jobs,
-	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
-	  char *data_file, uint8_t nb_max_matches, uint32_t nb_qps)
+	  bool perf_mode, uint32_t nb_iterations,
+	  uint8_t nb_max_matches, uint32_t nb_qps,
+	  char *data_buf, long data_len, long job_len)
 {
 	char *buf = NULL;
-	long buf_len = 0;
-	long job_len = 0;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
 	uint16_t qp_id;
@@ -344,22 +356,8 @@ run_regex(uint32_t nb_jobs,
 			}
 		}
 
-		buf_len = read_file(data_file, &buf);
-		if (buf_len <= 0) {
-			printf("Error, can't read file, or file is empty.\n");
-			res = -EXIT_FAILURE;
-			goto end;
-		}
-
-		job_len = buf_len / nb_jobs;
-		if (job_len == 0) {
-			printf("Error, To many jobs, for the given input.\n");
-			res = -EXIT_FAILURE;
-			goto end;
-		}
-
-		if (job_len > nb_max_payload) {
-			printf("Error, not enough jobs to cover input.\n");
+		if (clone_buf(data_buf, &buf, data_len)) {
+			printf("Error, can't clone buf.\n");
 			res = -EXIT_FAILURE;
 			goto end;
 		}
@@ -367,8 +365,8 @@ run_regex(uint32_t nb_jobs,
 		/* Assign each mbuf with the data to handle. */
 		actual_jobs = 0;
 		pos = 0;
-		for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
-			long act_job_len = RTE_MIN(job_len, buf_len - pos);
+		for (i = 0; (pos < data_len) && (i < nb_jobs) ; i++) {
+			long act_job_len = RTE_MIN(job_len, data_len - pos);
 			rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
 					act_job_len, &shinfo);
 			jobs_ctx[i].mbuf = ops[i]->mbuf;
@@ -505,6 +503,9 @@ main(int argc, char **argv)
 	uint16_t nb_max_payload = 0;
 	uint8_t nb_max_matches = 0;
 	uint32_t nb_qps = 1;
+	char *data_buf;
+	long data_len;
+	long job_len;
 
 	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
@@ -522,10 +523,24 @@ main(int argc, char **argv)
 			&nb_max_matches, nb_qps);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
-	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
-			nb_iterations, data_file, nb_max_matches, nb_qps);
+
+	data_len = read_file(data_file, &data_buf);
+	if (data_len <= 0)
+		rte_exit(EXIT_FAILURE, "Error, can't read file, or file is empty.\n");
+
+	job_len = data_len / nb_jobs;
+	if (job_len == 0)
+		rte_exit(EXIT_FAILURE, "Error, To many jobs, for the given input.\n");
+
+	if (job_len > nb_max_payload)
+		rte_exit(EXIT_FAILURE, "Error, not enough jobs to cover input.\n");
+
+	ret = run_regex(nb_jobs, perf_mode,
+			nb_iterations, nb_max_matches, nb_qps,
+			data_buf, data_len, job_len);
 	if (ret < 0) {
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
+	rte_free(data_buf);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v1 4/6] app/regex: support multi cores
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
                   ` (2 preceding siblings ...)
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 3/6] app/regex: read data file once at startup Ophir Munk
@ 2020-12-16 16:49 ` Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 5/6] app/regex: support performance measurements per QP Ophir Munk
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

Up to this commit the regex application was running with multiple QPs on
a single core.  This commit adds the option to specify a number of cores
on which multiple QPs will run.
A new parameter 'nb_lcores' was added to configure the number of cores:
--nb_lcores <num of cores>.
If not configured, the number of cores is set to 1 by default.  On
application startup a few initial steps occur by the main core: the
number of QPs and cores are parsed.  The QPs are distributed as evenly
as possible on the cores.  The regex device and all QPs are initialized.
The data file is read and saved in a buffer. Then for each core the
application calls rte_eal_remote_launch() with the worker routine
(run_regex) as its parameter.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 155 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 139 insertions(+), 16 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 9bafd02..720eb1c 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -34,6 +34,7 @@ enum app_args {
 	ARG_PERF_MODE,
 	ARG_NUM_OF_ITERATIONS,
 	ARG_NUM_OF_QPS,
+	ARG_NUM_OF_LCORES,
 };
 
 struct job_ctx {
@@ -49,6 +50,26 @@ struct qp_params {
 	char *buf;
 };
 
+struct qps_per_lcore {
+	unsigned int lcore_id;
+	int socket;
+	uint16_t qp_id_base;
+	uint16_t nb_qps;
+};
+
+struct regex_conf {
+	uint32_t nb_jobs;
+	bool perf_mode;
+	uint32_t nb_iterations;
+	char *data_file;
+	uint8_t nb_max_matches;
+	uint32_t nb_qps;
+	uint16_t qp_id_base;
+	char *data_buf;
+	long data_len;
+	long job_len;
+};
+
 static void
 usage(const char *prog_name)
 {
@@ -58,14 +79,15 @@ usage(const char *prog_name)
 		" --nb_jobs: number of jobs to use\n"
 		" --perf N: only outputs the performance data\n"
 		" --nb_iter N: number of iteration to run\n"
-		" --nb_qps N: number of queues to use\n",
+		" --nb_qps N: number of queues to use\n"
+		" --nb_lcores N: number of lcores to use\n",
 		prog_name);
 }
 
 static void
 args_parse(int argc, char **argv, char *rules_file, char *data_file,
 	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations,
-	   uint32_t *nb_qps)
+	   uint32_t *nb_qps, uint32_t *nb_lcores)
 {
 	char **argvopt;
 	int opt;
@@ -85,6 +107,8 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		{ "nb_iter", 1, 0, ARG_NUM_OF_ITERATIONS},
 		/* Number of QPs. */
 		{ "nb_qps", 1, 0, ARG_NUM_OF_QPS},
+		/* Number of lcores. */
+		{ "nb_lcores", 1, 0, ARG_NUM_OF_LCORES},
 		/* End of options */
 		{ 0, 0, 0, 0 }
 	};
@@ -121,6 +145,9 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		case ARG_NUM_OF_QPS:
 			*nb_qps = atoi(optarg);
 			break;
+		case ARG_NUM_OF_LCORES:
+			*nb_lcores = atoi(optarg);
+			break;
 		case ARG_HELP:
 			usage("RegEx test app");
 			break;
@@ -274,11 +301,18 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 }
 
 static int
-run_regex(uint32_t nb_jobs,
-	  bool perf_mode, uint32_t nb_iterations,
-	  uint8_t nb_max_matches, uint32_t nb_qps,
-	  char *data_buf, long data_len, long job_len)
+run_regex(void *args)
 {
+	struct regex_conf *rgxc = args;
+	uint32_t nb_jobs = rgxc->nb_jobs;
+	uint32_t nb_iterations = rgxc->nb_iterations;
+	uint8_t nb_max_matches = rgxc->nb_max_matches;
+	uint32_t nb_qps = rgxc->nb_qps;
+	uint16_t qp_id_base  = rgxc->qp_id_base;
+	char *data_buf = rgxc->data_buf;
+	long data_len = rgxc->data_len;
+	long job_len = rgxc->job_len;
+
 	char *buf = NULL;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
@@ -298,9 +332,13 @@ run_regex(uint32_t nb_jobs,
 	struct qp_params *qps = NULL;
 	bool update;
 	uint16_t qps_used = 0;
+	char mbuf_pool[16];
 
 	shinfo.free_cb = extbuf_free_cb;
-	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs * nb_qps, 0,
+	snprintf(mbuf_pool,
+		 sizeof(mbuf_pool),
+		 "mbuf_pool_%2u", qp_id_base);
+	mbuf_mp = rte_pktmbuf_pool_create(mbuf_pool, nb_jobs * nb_qps, 0,
 			0, MBUF_SIZE, rte_socket_id());
 	if (mbuf_mp == NULL) {
 		printf("Error, can't create memory pool\n");
@@ -402,7 +440,7 @@ run_regex(uint32_t nb_jobs,
 						qp->total_enqueue +=
 						rte_regexdev_enqueue_burst
 							(dev_id,
-							qp_id,
+							qp_id_base + qp_id,
 							cur_ops_to_enqueue,
 							actual_jobs -
 							qp->total_enqueue);
@@ -418,7 +456,7 @@ run_regex(uint32_t nb_jobs,
 					qp->total_dequeue +=
 						rte_regexdev_dequeue_burst
 							(dev_id,
-							qp_id,
+							qp_id_base + qp_id,
 							cur_ops_to_dequeue,
 							qp->total_enqueue -
 							qp->total_dequeue);
@@ -435,7 +473,7 @@ run_regex(uint32_t nb_jobs,
 	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
 		1000000000.0);
 
-	if (perf_mode)
+	if (rgxc->perf_mode)
 		goto end;
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 		printf("\n############ QP id=%u ############\n", qp_id);
@@ -491,6 +529,67 @@ run_regex(uint32_t nb_jobs,
 	return res;
 }
 
+static int
+distribute_qps_to_lcores(uint32_t nb_cores, uint32_t nb_qps,
+			 struct qps_per_lcore **qpl)
+{
+	int socket;
+	unsigned lcore_id;
+	uint32_t i;
+	uint16_t min_qp_id;
+	uint16_t max_qp_id;
+	struct qps_per_lcore *qps_per_lcore;
+	uint32_t detected_lcores;
+
+	if (nb_qps < nb_cores) {
+		nb_cores = nb_qps;
+		printf("Reducing number of cores to number of QPs (%u)\n",
+		       nb_cores);
+	}
+	/* Allocate qps_per_lcore array */
+	qps_per_lcore =
+		rte_malloc(NULL, sizeof(*qps_per_lcore) * nb_cores, 0);
+	if (!qps_per_lcore)
+		rte_exit(EXIT_FAILURE, "Falied to create qps_per_lcore array\n");
+	*qpl = qps_per_lcore;
+	detected_lcores = 0;
+	min_qp_id = 0;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		if (detected_lcores >= nb_cores)
+			break;
+		qps_per_lcore[detected_lcores].lcore_id = lcore_id;
+		socket = rte_lcore_to_socket_id(lcore_id);
+		if (socket == SOCKET_ID_ANY)
+			socket = 0;
+		qps_per_lcore[detected_lcores].socket = socket;
+		qps_per_lcore[detected_lcores].qp_id_base = min_qp_id;
+		max_qp_id = min_qp_id + nb_qps / nb_cores - 1;
+		if (nb_qps % nb_cores > detected_lcores)
+			max_qp_id++;
+		qps_per_lcore[detected_lcores].nb_qps = max_qp_id -
+							min_qp_id + 1;
+		min_qp_id = max_qp_id + 1;
+		detected_lcores++;
+	}
+	if (detected_lcores != nb_cores)
+		return -1;
+
+	for (i = 0; i < detected_lcores; i++) {
+		printf("===> Core %d: allocated queues: ",
+		       qps_per_lcore[i].lcore_id);
+		min_qp_id = qps_per_lcore[i].qp_id_base;
+		max_qp_id =
+			qps_per_lcore[i].qp_id_base + qps_per_lcore[i].nb_qps;
+		while (min_qp_id < max_qp_id) {
+			printf("%u ", min_qp_id);
+			min_qp_id++;
+		}
+		printf("\n");
+	}
+	return 0;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -506,6 +605,10 @@ main(int argc, char **argv)
 	char *data_buf;
 	long data_len;
 	long job_len;
+	uint32_t nb_lcores = 1;
+	struct regex_conf *rgxc;
+	uint32_t i;
+	struct qps_per_lcore *qps_per_lcore;
 
 	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
@@ -515,10 +618,15 @@ main(int argc, char **argv)
 	argv += ret;
 	if (argc > 1)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
-				&perf_mode, &nb_iterations, &nb_qps);
+				&perf_mode, &nb_iterations, &nb_qps,
+				&nb_lcores);
 
 	if (nb_qps == 0)
 		rte_exit(EXIT_FAILURE, "Number of QPs must be greater than 0\n");
+	if (nb_lcores == 0)
+		rte_exit(EXIT_FAILURE, "Number of lcores must be greater than 0\n");
+	if (distribute_qps_to_lcores(nb_lcores, nb_qps, &qps_per_lcore) < 0)
+		rte_exit(EXIT_FAILURE, "Failed to distribute queues to lcores!\n");
 	ret = init_port(&nb_max_payload, rules_file,
 			&nb_max_matches, nb_qps);
 	if (ret < 0)
@@ -535,12 +643,27 @@ main(int argc, char **argv)
 	if (job_len > nb_max_payload)
 		rte_exit(EXIT_FAILURE, "Error, not enough jobs to cover input.\n");
 
-	ret = run_regex(nb_jobs, perf_mode,
-			nb_iterations, nb_max_matches, nb_qps,
-			data_buf, data_len, job_len);
-	if (ret < 0) {
-		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
+	rgxc = rte_malloc(NULL, sizeof(*rgxc) * nb_lcores, 0);
+	if (!rgxc)
+		rte_exit(EXIT_FAILURE, "Falied to create Regex Conf\n");
+	for (i = 0; i < nb_lcores; i++) {
+		rgxc[i] = (struct regex_conf){
+			.nb_jobs = nb_jobs,
+			.perf_mode = perf_mode,
+			.nb_iterations = nb_iterations,
+			.nb_max_matches = nb_max_matches,
+			.nb_qps = qps_per_lcore[i].nb_qps,
+			.qp_id_base = qps_per_lcore[i].qp_id_base,
+			.data_buf = data_buf,
+			.data_len = data_len,
+			.job_len = job_len,
+		};
+		rte_eal_remote_launch(run_regex, &rgxc[i],
+				      qps_per_lcore[i].lcore_id);
 	}
+	rte_eal_mp_wait_lcore();
 	rte_free(data_buf);
+	rte_free(rgxc);
+	rte_free(qps_per_lcore);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v1 5/6] app/regex: support performance measurements per QP
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
                   ` (3 preceding siblings ...)
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 4/6] app/regex: support multi cores Ophir Munk
@ 2020-12-16 16:49 ` Ophir Munk
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
  2020-12-17 11:52 ` [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ori Kam
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

Up to this commit, measuring the parsing elapsed time and gigabit-per-second
performance was done on the aggregation of all QPs (per core).
This commit separates the time measurements per individual QP.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 720eb1c..f305186 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -48,6 +48,8 @@ struct qp_params {
 	struct rte_regex_ops **ops;
 	struct job_ctx *jobs_ctx;
 	char *buf;
+	time_t start;
+	time_t end;
 };
 
 struct qps_per_lcore {
@@ -324,8 +326,6 @@ run_regex(void *args)
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
 	int res = 0;
-	time_t start;
-	time_t end;
 	double time;
 	struct rte_mempool *mbuf_mp;
 	struct qp_params *qp;
@@ -418,9 +418,10 @@ run_regex(void *args)
 
 		qp->buf = buf;
 		qp->total_matches = 0;
+		qp->start = 0;
+		qp->end = 0;
 	}
 
-	start = clock();
 	for (i = 0; i < nb_iterations; i++) {
 		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 			qp = &qps[qp_id];
@@ -431,6 +432,8 @@ run_regex(void *args)
 			update = false;
 			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 				qp = &qps[qp_id];
+				if (!qp->start)
+					qp->start = clock();
 				if (qp->total_dequeue < actual_jobs) {
 					struct rte_regex_ops **
 						cur_ops_to_enqueue = qp->ops +
@@ -461,22 +464,30 @@ run_regex(void *args)
 							qp->total_enqueue -
 							qp->total_dequeue);
 					update = true;
+				} else {
+					if (!qp->end)
+						qp->end = clock();
 				}
+
 			}
 		} while (update);
 	}
-	end = clock();
-	time = ((double)end - start) / CLOCKS_PER_SEC;
-	printf("Job len = %ld Bytes\n",  job_len);
-	printf("Time = %lf sec\n",  time);
-	printf("Perf = %lf Gbps\n",
-	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
-		1000000000.0);
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
+		printf("Core=%u QP=%u\n", rte_lcore_id(), qp_id + qp_id_base);
+		printf("Job len = %ld Bytes\n",  job_len);
+		printf("Time = %lf sec\n",  time);
+		printf("Perf = %lf Gbps\n\n",
+				(((double)actual_jobs * job_len *
+				nb_iterations * 8) / time) /
+				1000000000.0);
+	}
 
 	if (rgxc->perf_mode)
 		goto end;
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
-		printf("\n############ QP id=%u ############\n", qp_id);
+		printf("\n############ Core=%u QP=%u ############\n",
+		       rte_lcore_id(), qp_id + qp_id_base);
 		qp = &qps[qp_id];
 		/* Log results per job. */
 		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v1 6/6] app/regex: replace Linux clock() API with rdtsc
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
                   ` (4 preceding siblings ...)
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 5/6] app/regex: support performance measurements per QP Ophir Munk
@ 2020-12-16 16:49 ` Ophir Munk
  2020-12-17 11:52 ` [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ori Kam
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-16 16:49 UTC (permalink / raw)
  To: Ori Kam, dev, Raslan Darawsheh; +Cc: Ophir Munk, Thomas Monjalon

Performance measurements (elapsed time and Gbps) are based on the Linux
clock() API. The resolution is improved by replacing the clock() API
with the rte_rdtsc_precise() API.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index f305186..ce0ede2 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -48,8 +48,8 @@ struct qp_params {
 	struct rte_regex_ops **ops;
 	struct job_ctx *jobs_ctx;
 	char *buf;
-	time_t start;
-	time_t end;
+	uint64_t start;
+	uint64_t cycles;
 };
 
 struct qps_per_lcore {
@@ -326,7 +326,7 @@ run_regex(void *args)
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
 	int res = 0;
-	double time;
+	long double time;
 	struct rte_mempool *mbuf_mp;
 	struct qp_params *qp;
 	struct qp_params *qps = NULL;
@@ -419,7 +419,7 @@ run_regex(void *args)
 		qp->buf = buf;
 		qp->total_matches = 0;
 		qp->start = 0;
-		qp->end = 0;
+		qp->cycles = 0;
 	}
 
 	for (i = 0; i < nb_iterations; i++) {
@@ -432,9 +432,8 @@ run_regex(void *args)
 			update = false;
 			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 				qp = &qps[qp_id];
-				if (!qp->start)
-					qp->start = clock();
 				if (qp->total_dequeue < actual_jobs) {
+					qp->start = rte_rdtsc_precise();
 					struct rte_regex_ops **
 						cur_ops_to_enqueue = qp->ops +
 						qp->total_enqueue;
@@ -463,24 +462,21 @@ run_regex(void *args)
 							cur_ops_to_dequeue,
 							qp->total_enqueue -
 							qp->total_dequeue);
+					qp->cycles +=
+					     (rte_rdtsc_precise() - qp->start);
 					update = true;
-				} else {
-					if (!qp->end)
-						qp->end = clock();
 				}
-
 			}
 		} while (update);
 	}
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
-		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
-		printf("Core=%u QP=%u\n", rte_lcore_id(), qp_id + qp_id_base);
-		printf("Job len = %ld Bytes\n",  job_len);
-		printf("Time = %lf sec\n",  time);
-		printf("Perf = %lf Gbps\n\n",
-				(((double)actual_jobs * job_len *
-				nb_iterations * 8) / time) /
-				1000000000.0);
+		qp = &qps[qp_id];
+		time = (long double)qp->cycles / rte_get_timer_hz();
+		printf("Core=%u QP=%u Job=%ld Bytes Time=%Lf sec Perf=%Lf "
+		       "Gbps\n", rte_lcore_id(), qp_id + qp_id_base,
+		       job_len, time,
+		       (((double)actual_jobs * job_len * nb_iterations * 8)
+		       / time) / 1000000000.0);
 	}
 
 	if (rgxc->perf_mode)
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support
  2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
                   ` (5 preceding siblings ...)
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
@ 2020-12-17 11:52 ` Ori Kam
  6 siblings, 0 replies; 28+ messages in thread
From: Ori Kam @ 2020-12-17 11:52 UTC (permalink / raw)
  To: Ophir Munk, dev, Raslan Darawsheh; +Cc: Ophir Munk, NBU-Contact-Thomas Monjalon

Hi Ophir,

You should also update the testregex.rst file.

Best,
Ori

> -----Original Message-----
> From: Ophir Munk <ophirmu@nvidia.com>
> Sent: Wednesday, December 16, 2020 6:49 PM
> Subject: [PATCH v1 0/6] regex multi Q with multi cores support
> 
> This patchset enhances the regex application to support multi Q with multi
> cores.
> 
> v1: initial release
> 
> Ophir Munk (6):
>   app/regex: move mem pool creation to worker routine
>   app/regex: support multi QPs
>   app/regex: read data file once at startup
>   app/regex: support multi cores
>   app/regex: support performance measurements per QP
>   app/regex: replace Linux clock() API with rdtsc
> 
>  app/test-regex/main.c | 519 ++++++++++++++++++++++++++++++++++++-------
> -------
>  1 file changed, 373 insertions(+), 146 deletions(-)
> 
> --
> 2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support
  2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
@ 2020-12-20 10:41   ` Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
                       ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

This patchset enhances the regex application to support multi Q with multi cores.

v1: Initial release.
v2: Update documentation (testregex.rst).
    Fix checkpatch misspelling errors.

Ophir Munk (6):
  app/regex: move mem pool creation to worker routine
  app/regex: support multi QPs
  app/regex: read data file once at startup
  app/regex: support multi cores
  app/regex: support performance measurements per QP
  app/regex: replace Linux clock() API with rdtsc

 app/test-regex/main.c          | 519 +++++++++++++++++++++++++++++------------
 doc/guides/tools/testregex.rst |  30 ++-
 2 files changed, 398 insertions(+), 151 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 1/6] app/regex: move mem pool creation to worker routine
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
@ 2020-12-20 10:41     ` Ophir Munk
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 2/6] app/regex: support multi QPs Ophir Munk
                       ` (5 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

Function rte_pktmbuf_pool_create() is moved from the init_port() routine to
the run_regex() routine. In preparation for multi-core support, init_port()
will be called only once as part of application startup, while mem pool
creation will be called multiple times (once per core).

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index ac6152d..cb2a065 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -163,8 +163,7 @@ read_file(char *file, char **buf)
 }
 
 static int
-init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
-	  uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
+init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 {
 	uint16_t id;
 	uint16_t num_devs;
@@ -187,14 +186,6 @@ init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
 		return -EINVAL;
 	}
 
-	*mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
-					  0, MBUF_SIZE, rte_socket_id());
-	if (*mbuf_mp == NULL) {
-		printf("Error, can't create memory pool\n");
-		res = -ENOMEM;
-		goto error;
-	}
-
 	rules_len = read_file(rules_file, &rules);
 	if (rules_len < 0) {
 		printf("Error, can't read rules files.\n");
@@ -237,8 +228,6 @@ init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
 error:
 	if (rules)
 		rte_free(rules);
-	if (*mbuf_mp)
-		rte_mempool_free(*mbuf_mp);
 	return res;
 }
 
@@ -248,7 +237,7 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 }
 
 static int
-run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
+run_regex(uint32_t nb_jobs,
 	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
 	  char *data_file, uint8_t nb_max_matches)
 {
@@ -273,9 +262,17 @@ run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
 	time_t end;
 	double time;
 	struct job_ctx *jobs_ctx;
+	struct rte_mempool *mbuf_mp;
 
 	shinfo.free_cb = extbuf_free_cb;
 
+	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
+			0, MBUF_SIZE, rte_socket_id());
+	if (mbuf_mp == NULL) {
+		printf("Error, can't create memory pool\n");
+		return -ENOMEM;
+	}
+
 	ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
 	if (!ops) {
 		printf("Error, can't allocate memory for ops.\n");
@@ -409,6 +406,9 @@ run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
 	rte_free(jobs_ctx);
 	if (buf)
 		rte_free(buf);
+	if (mbuf_mp)
+		rte_mempool_free(mbuf_mp);
+
 	return res;
 }
 
@@ -417,7 +417,6 @@ main(int argc, char **argv)
 {
 	char rules_file[MAX_FILE_NAME];
 	char data_file[MAX_FILE_NAME];
-	struct rte_mempool *mbuf_mp = NULL;
 	uint32_t nb_jobs = 0;
 	uint16_t nb_max_payload = 0;
 	bool perf_mode = 0;
@@ -434,16 +433,13 @@ main(int argc, char **argv)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
 			   &perf_mode, &nb_iterations);
 
-	ret = init_port(&mbuf_mp, nb_jobs, &nb_max_payload, rules_file,
-			&nb_max_matches);
+	ret = init_port(&nb_max_payload, rules_file, &nb_max_matches);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
-	ret = run_regex(mbuf_mp, nb_jobs, nb_max_payload, perf_mode,
+	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
 			nb_iterations, data_file, nb_max_matches);
 	if (ret < 0) {
-		rte_mempool_free(mbuf_mp);
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
-	rte_mempool_free(mbuf_mp);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 2/6] app/regex: support multi QPs
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
@ 2020-12-20 10:41     ` Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 3/6] app/regex: read data file once at startup Ophir Munk
                       ` (4 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

Up to this commit the regex application used one QP which was assigned a
number of jobs, each with a different segment of a file to parse. This
commit adds support for multiple QP assignments. All QPs will be
assigned the same number of jobs, with the same segments of file to
parse. This enables comparing functionality with different numbers of
QPs. All queues are managed on one core with one thread. This commit
focuses on changing the routines' API to support multiple QPs; mainly,
QP scalar variables are replaced by per-QP struct instances. The
enqueue/dequeue operations are interleaved as follows:
 enqueue(QP #1)
 enqueue(QP #2)
 ...
 enqueue(QP #n)
 dequeue(QP #1)
 dequeue(QP #2)
 ...
 dequeue(QP #n)

A new parameter 'nb_qps' was added to configure the number of QPs:
 --nb_qps <num of qps>.
If not configured, nb_qps is set to 1 by default.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 322 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 204 insertions(+), 118 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index cb2a065..d225267 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -33,12 +33,22 @@ enum app_args {
 	ARG_NUM_OF_JOBS,
 	ARG_PERF_MODE,
 	ARG_NUM_OF_ITERATIONS,
+	ARG_NUM_OF_QPS,
 };
 
 struct job_ctx {
 	struct rte_mbuf *mbuf;
 };
 
+struct qp_params {
+	uint32_t total_enqueue;
+	uint32_t total_dequeue;
+	uint32_t total_matches;
+	struct rte_regex_ops **ops;
+	struct job_ctx *jobs_ctx;
+	char *buf;
+};
+
 static void
 usage(const char *prog_name)
 {
@@ -47,13 +57,15 @@ usage(const char *prog_name)
 		" --data NAME: data file to use\n"
 		" --nb_jobs: number of jobs to use\n"
 		" --perf N: only outputs the performance data\n"
-		" --nb_iter N: number of iteration to run\n",
+		" --nb_iter N: number of iteration to run\n"
+		" --nb_qps N: number of queues to use\n",
 		prog_name);
 }
 
 static void
 args_parse(int argc, char **argv, char *rules_file, char *data_file,
-	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations)
+	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations,
+	   uint32_t *nb_qps)
 {
 	char **argvopt;
 	int opt;
@@ -71,6 +83,8 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		{ "perf", 0, 0, ARG_PERF_MODE},
 		/* Number of iterations to run with perf test */
 		{ "nb_iter", 1, 0, ARG_NUM_OF_ITERATIONS},
+		/* Number of QPs. */
+		{ "nb_qps", 1, 0, ARG_NUM_OF_QPS},
 		/* End of options */
 		{ 0, 0, 0, 0 }
 	};
@@ -104,6 +118,9 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		case ARG_NUM_OF_ITERATIONS:
 			*nb_iterations = atoi(optarg);
 			break;
+		case ARG_NUM_OF_QPS:
+			*nb_qps = atoi(optarg);
+			break;
 		case ARG_HELP:
 			usage("RegEx test app");
 			break;
@@ -163,15 +180,17 @@ read_file(char *file, char **buf)
 }
 
 static int
-init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
+init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches,
+	  uint32_t nb_qps)
 {
 	uint16_t id;
+	uint16_t qp_id;
 	uint16_t num_devs;
 	char *rules = NULL;
 	long rules_len;
 	struct rte_regexdev_info info;
 	struct rte_regexdev_config dev_conf = {
-		.nb_queue_pairs = 1,
+		.nb_queue_pairs = nb_qps,
 		.nb_groups = 1,
 	};
 	struct rte_regexdev_qp_conf qp_conf = {
@@ -203,7 +222,8 @@ init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 		*nb_max_matches = info.max_matches;
 		*nb_max_payload = info.max_payload_size;
 		if (info.regexdev_capa & RTE_REGEXDEV_SUPP_MATCH_AS_END_F)
-			dev_conf.dev_cfg_flags |= RTE_REGEXDEV_CFG_MATCH_AS_END_F;
+			dev_conf.dev_cfg_flags |=
+			RTE_REGEXDEV_CFG_MATCH_AS_END_F;
 		dev_conf.nb_max_matches = info.max_matches;
 		dev_conf.nb_rules_per_group = info.max_rules_per_group;
 		dev_conf.rule_db_len = rules_len;
@@ -214,12 +234,16 @@ init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 			goto error;
 		}
 		if (info.regexdev_capa & RTE_REGEXDEV_CAPA_QUEUE_PAIR_OOS_F)
-			qp_conf.qp_conf_flags |= RTE_REGEX_QUEUE_PAIR_CFG_OOS_F;
-		res = rte_regexdev_queue_pair_setup(id, 0, &qp_conf);
-		if (res < 0) {
-			printf("Error, can't setup queue pair for device %d.\n",
-			       id);
-			goto error;
+			qp_conf.qp_conf_flags |=
+			RTE_REGEX_QUEUE_PAIR_CFG_OOS_F;
+		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+			res = rte_regexdev_queue_pair_setup(id, qp_id,
+							    &qp_conf);
+			if (res < 0) {
+				printf("Error, can't setup queue pair %u for "
+				       "device %d.\n", qp_id, id);
+				goto error;
+			}
 		}
 		printf(":: initializing device: %d done\n", id);
 	}
@@ -239,122 +263,171 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 static int
 run_regex(uint32_t nb_jobs,
 	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
-	  char *data_file, uint8_t nb_max_matches)
+	  char *data_file, uint8_t nb_max_matches, uint32_t nb_qps)
 {
 	char *buf = NULL;
-	long buf_len;
-	long job_len;
+	long buf_len = 0;
+	long job_len = 0;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
-	struct rte_regex_ops **ops;
+	uint16_t qp_id;
 	uint16_t dev_id = 0;
-	uint16_t qp_id = 0;
 	uint8_t nb_matches;
 	struct rte_regexdev_match *match;
-	long pos = 0;
+	long pos;
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
-	uint32_t total_enqueue = 0;
-	uint32_t total_dequeue = 0;
-	uint32_t total_matches = 0;
 	int res = 0;
 	time_t start;
 	time_t end;
 	double time;
-	struct job_ctx *jobs_ctx;
 	struct rte_mempool *mbuf_mp;
+	struct qp_params *qp;
+	struct qp_params *qps = NULL;
+	bool update;
+	uint16_t qps_used = 0;
 
 	shinfo.free_cb = extbuf_free_cb;
-
-	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
+	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs * nb_qps, 0,
 			0, MBUF_SIZE, rte_socket_id());
 	if (mbuf_mp == NULL) {
 		printf("Error, can't create memory pool\n");
 		return -ENOMEM;
 	}
 
-	ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
-	if (!ops) {
-		printf("Error, can't allocate memory for ops.\n");
-		return -ENOMEM;
-	}
-
-	jobs_ctx = rte_malloc(NULL, sizeof(struct job_ctx)*nb_jobs, 0);
-	if (!jobs_ctx) {
-		printf("Error, can't allocate memory for jobs_ctx.\n");
-		return -ENOMEM;
+	qps = rte_malloc(NULL, sizeof(*qps) * nb_qps, 0);
+	if (!qps) {
+		printf("Error, can't allocate memory for QPs\n");
+		res = -ENOMEM;
+		goto end;
 	}
 
-	/* Allocate the jobs and assign each job with an mbuf. */
-	for (i = 0; i < nb_jobs; i++) {
-		ops[i] = rte_malloc(NULL, sizeof(*ops[0]) + nb_max_matches *
-				    sizeof(struct rte_regexdev_match), 0);
-		if (!ops[i]) {
-			printf("Error, can't allocate memory for op.\n");
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		struct rte_regex_ops **ops;
+		struct job_ctx *jobs_ctx;
+
+		qps_used++;
+		qp = &qps[qp_id];
+		qp->jobs_ctx = NULL;
+		qp->buf = NULL;
+		qp->ops = ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
+		if (!ops) {
+			printf("Error, can't allocate memory for ops.\n");
 			res = -ENOMEM;
 			goto end;
 		}
-		ops[i]->mbuf = rte_pktmbuf_alloc(mbuf_mp);
-		if (!ops[i]->mbuf) {
-			printf("Error, can't attach mbuf.\n");
+
+		qp->jobs_ctx = jobs_ctx =
+			rte_malloc(NULL, sizeof(*jobs_ctx) * nb_jobs, 0);
+		if (!jobs_ctx) {
+			printf("Error, can't allocate memory for jobs_ctx.\n");
 			res = -ENOMEM;
 			goto end;
 		}
-	}
 
-	buf_len = read_file(data_file, &buf);
-	if (buf_len <= 0) {
-		printf("Error, can't read file, or file is empty.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		/* Allocate the jobs and assign each job with an mbuf. */
+		for (i = 0; i < nb_jobs; i++) {
+			ops[i] = rte_malloc(NULL, sizeof(*ops[0]) +
+					nb_max_matches *
+					sizeof(struct rte_regexdev_match), 0);
+			if (!ops[i]) {
+				printf("Error, can't allocate "
+				       "memory for op.\n");
+				res = -ENOMEM;
+				goto end;
+			}
+			ops[i]->mbuf = rte_pktmbuf_alloc(mbuf_mp);
+			if (!ops[i]->mbuf) {
+				printf("Error, can't attach mbuf.\n");
+				res = -ENOMEM;
+				goto end;
+			}
+		}
 
-	job_len = buf_len / nb_jobs;
-	if (job_len == 0) {
-		printf("Error, To many jobs, for the given input.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		buf_len = read_file(data_file, &buf);
+		if (buf_len <= 0) {
+			printf("Error, can't read file, or file is empty.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
 
-	if (job_len > nb_max_payload) {
-		printf("Error, not enough jobs to cover input.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		job_len = buf_len / nb_jobs;
+		if (job_len == 0) {
+			printf("Error, To many jobs, for the given input.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
+
+		if (job_len > nb_max_payload) {
+			printf("Error, not enough jobs to cover input.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
 
-	/* Assign each mbuf with the data to handle. */
-	for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
-		long act_job_len = RTE_MIN(job_len, buf_len - pos);
-		rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
-					  act_job_len, &shinfo);
-		jobs_ctx[i].mbuf = ops[i]->mbuf;
-		ops[i]->mbuf->data_len = job_len;
-		ops[i]->mbuf->pkt_len = act_job_len;
-		ops[i]->user_id = i;
-		ops[i]->group_id0 = 1;
-		pos += act_job_len;
-		actual_jobs++;
+		/* Assign each mbuf with the data to handle. */
+		actual_jobs = 0;
+		pos = 0;
+		for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
+			long act_job_len = RTE_MIN(job_len, buf_len - pos);
+			rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
+					act_job_len, &shinfo);
+			jobs_ctx[i].mbuf = ops[i]->mbuf;
+			ops[i]->mbuf->data_len = job_len;
+			ops[i]->mbuf->pkt_len = act_job_len;
+			ops[i]->user_id = i;
+			ops[i]->group_id0 = 1;
+			pos += act_job_len;
+			actual_jobs++;
+		}
+
+		qp->buf = buf;
+		qp->total_matches = 0;
 	}
 
 	start = clock();
 	for (i = 0; i < nb_iterations; i++) {
-		total_enqueue = 0;
-		total_dequeue = 0;
-		while (total_dequeue < actual_jobs) {
-			struct rte_regex_ops **cur_ops_to_enqueue = ops +
-				total_enqueue;
-			struct rte_regex_ops **cur_ops_to_dequeue = ops +
-				total_dequeue;
-
-			if (actual_jobs - total_enqueue)
-				total_enqueue += rte_regexdev_enqueue_burst
-					(dev_id, qp_id, cur_ops_to_enqueue,
-					 actual_jobs - total_enqueue);
-
-			total_dequeue += rte_regexdev_dequeue_burst
-				(dev_id, qp_id, cur_ops_to_dequeue,
-				 total_enqueue - total_dequeue);
+		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+			qp = &qps[qp_id];
+			qp->total_enqueue = 0;
+			qp->total_dequeue = 0;
 		}
+		do {
+			update = false;
+			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+				qp = &qps[qp_id];
+				if (qp->total_dequeue < actual_jobs) {
+					struct rte_regex_ops **
+						cur_ops_to_enqueue = qp->ops +
+						qp->total_enqueue;
+
+					if (actual_jobs - qp->total_enqueue)
+						qp->total_enqueue +=
+						rte_regexdev_enqueue_burst
+							(dev_id,
+							qp_id,
+							cur_ops_to_enqueue,
+							actual_jobs -
+							qp->total_enqueue);
+				}
+			}
+			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+				qp = &qps[qp_id];
+				if (qp->total_dequeue < actual_jobs) {
+					struct rte_regex_ops **
+						cur_ops_to_dequeue = qp->ops +
+						qp->total_dequeue;
+
+					qp->total_dequeue +=
+						rte_regexdev_dequeue_burst
+							(dev_id,
+							qp_id,
+							cur_ops_to_dequeue,
+							qp->total_enqueue -
+							qp->total_dequeue);
+					update = true;
+				}
+			}
+		} while (update);
 	}
 	end = clock();
 	time = ((double)end - start) / CLOCKS_PER_SEC;
@@ -364,51 +437,59 @@ run_regex(uint32_t nb_jobs,
 	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
 		1000000000.0);
 
-	if (!perf_mode) {
+	if (perf_mode)
+		goto end;
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		printf("\n############ QP id=%u ############\n", qp_id);
+		qp = &qps[qp_id];
 		/* Log results per job. */
-		for (d_ind = 0; d_ind < total_dequeue; d_ind++) {
-			nb_matches = ops[d_ind % actual_jobs]->nb_matches;
+		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
+			nb_matches = qp->ops[d_ind % actual_jobs]->nb_matches;
 			printf("Job id %"PRIu64" number of matches = %d\n",
-			       ops[d_ind]->user_id, nb_matches);
-			total_matches += nb_matches;
-			match = ops[d_ind % actual_jobs]->matches;
+					qp->ops[d_ind]->user_id, nb_matches);
+			qp->total_matches += nb_matches;
+			match = qp->ops[d_ind % actual_jobs]->matches;
 			for (i = 0; i < nb_matches; i++) {
-				printf("match %d, rule = %d, start = %d,len = %d\n",
+				printf("match %d, rule = %d, "
+				       "start = %d,len = %d\n",
 				       i, match->rule_id, match->start_offset,
 				       match->len);
 				match++;
 			}
 		}
-		printf("Total matches = %d\n", total_matches);
+		printf("Total matches = %d\n", qp->total_matches);
 		printf("All Matches:\n");
-
 		/* Log absolute results. */
-		for (d_ind = 0; d_ind < total_dequeue; d_ind++) {
-			nb_matches = ops[d_ind % actual_jobs]->nb_matches;
-			total_matches += nb_matches;
-			match = ops[d_ind % actual_jobs]->matches;
+		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
+			nb_matches = qp->ops[d_ind % actual_jobs]->nb_matches;
+			qp->total_matches += nb_matches;
+			match = qp->ops[d_ind % actual_jobs]->matches;
 			for (i = 0; i < nb_matches; i++) {
 				printf("start = %ld, len = %d, rule = %d\n",
-				       match->start_offset + d_ind * job_len,
-				       match->len, match->rule_id);
+						match->start_offset +
+						d_ind * job_len,
+						match->len, match->rule_id);
 				match++;
 			}
 		}
 	}
 end:
-	for (i = 0; i < actual_jobs; i++) {
-		if (ops[i])
-			rte_free(ops[i]);
-		if (jobs_ctx[i].mbuf)
-			rte_pktmbuf_free(jobs_ctx[i].mbuf);
+	for (qp_id = 0; qp_id < qps_used; qp_id++) {
+		qp = &qps[qp_id];
+		for (i = 0; i < actual_jobs && qp->ops; i++)
+			rte_free(qp->ops[i]);
+		rte_free(qp->ops);
+		qp->ops = NULL;
+		for (i = 0; i < actual_jobs && qp->jobs_ctx; i++)
+			rte_pktmbuf_free(qp->jobs_ctx[i].mbuf);
+		rte_free(qp->jobs_ctx);
+		qp->jobs_ctx = NULL;
+		rte_free(qp->buf);
+		qp->buf = NULL;
 	}
-	rte_free(ops);
-	rte_free(jobs_ctx);
-	if (buf)
-		rte_free(buf);
 	if (mbuf_mp)
 		rte_mempool_free(mbuf_mp);
-
+	rte_free(qps);
 	return res;
 }
 
@@ -418,12 +499,14 @@ main(int argc, char **argv)
 	char rules_file[MAX_FILE_NAME];
 	char data_file[MAX_FILE_NAME];
 	uint32_t nb_jobs = 0;
-	uint16_t nb_max_payload = 0;
 	bool perf_mode = 0;
 	uint32_t nb_iterations = 0;
-	uint8_t nb_max_matches = 0;
 	int ret;
+	uint16_t nb_max_payload = 0;
+	uint8_t nb_max_matches = 0;
+	uint32_t nb_qps = 1;
 
+	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "EAL init failed\n");
@@ -431,13 +514,16 @@ main(int argc, char **argv)
 	argv += ret;
 	if (argc > 1)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
-			   &perf_mode, &nb_iterations);
+				&perf_mode, &nb_iterations, &nb_qps);
 
-	ret = init_port(&nb_max_payload, rules_file, &nb_max_matches);
+	if (nb_qps == 0)
+		rte_exit(EXIT_FAILURE, "Number of QPs must be greater than 0\n");
+	ret = init_port(&nb_max_payload, rules_file,
+			&nb_max_matches, nb_qps);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
 	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
-			nb_iterations, data_file, nb_max_matches);
+			nb_iterations, data_file, nb_max_matches, nb_qps);
 	if (ret < 0) {
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 3/6] app/regex: read data file once at startup
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 2/6] app/regex: support multi QPs Ophir Munk
@ 2020-12-20 10:41     ` Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 4/6] app/regex: support multi cores Ophir Munk
                       ` (3 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

Up to this commit the input data file was read from scratch for each QP,
which is redundant. Starting from this commit the data file is read only
once at startup. Each QP will clone the data.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 63 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index d225267..9bafd02 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -180,6 +180,19 @@ read_file(char *file, char **buf)
 }
 
 static int
+clone_buf(char *data_buf, char **buf, long data_len)
+{
+	char *dest_buf;
+	dest_buf =
+		rte_malloc(NULL, sizeof(char) * (data_len + 1), 4096);
+	if (!dest_buf)
+		return -ENOMEM;
+	memcpy(dest_buf, data_buf, data_len + 1);
+	*buf = dest_buf;
+	return 0;
+}
+
+static int
 init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches,
 	  uint32_t nb_qps)
 {
@@ -262,12 +275,11 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 
 static int
 run_regex(uint32_t nb_jobs,
-	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
-	  char *data_file, uint8_t nb_max_matches, uint32_t nb_qps)
+	  bool perf_mode, uint32_t nb_iterations,
+	  uint8_t nb_max_matches, uint32_t nb_qps,
+	  char *data_buf, long data_len, long job_len)
 {
 	char *buf = NULL;
-	long buf_len = 0;
-	long job_len = 0;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
 	uint16_t qp_id;
@@ -344,22 +356,8 @@ run_regex(uint32_t nb_jobs,
 			}
 		}
 
-		buf_len = read_file(data_file, &buf);
-		if (buf_len <= 0) {
-			printf("Error, can't read file, or file is empty.\n");
-			res = -EXIT_FAILURE;
-			goto end;
-		}
-
-		job_len = buf_len / nb_jobs;
-		if (job_len == 0) {
-			printf("Error, To many jobs, for the given input.\n");
-			res = -EXIT_FAILURE;
-			goto end;
-		}
-
-		if (job_len > nb_max_payload) {
-			printf("Error, not enough jobs to cover input.\n");
+		if (clone_buf(data_buf, &buf, data_len)) {
+			printf("Error, can't clone buf.\n");
 			res = -EXIT_FAILURE;
 			goto end;
 		}
@@ -367,8 +365,8 @@ run_regex(uint32_t nb_jobs,
 		/* Assign each mbuf with the data to handle. */
 		actual_jobs = 0;
 		pos = 0;
-		for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
-			long act_job_len = RTE_MIN(job_len, buf_len - pos);
+		for (i = 0; (pos < data_len) && (i < nb_jobs) ; i++) {
+			long act_job_len = RTE_MIN(job_len, data_len - pos);
 			rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
 					act_job_len, &shinfo);
 			jobs_ctx[i].mbuf = ops[i]->mbuf;
@@ -505,6 +503,9 @@ main(int argc, char **argv)
 	uint16_t nb_max_payload = 0;
 	uint8_t nb_max_matches = 0;
 	uint32_t nb_qps = 1;
+	char *data_buf;
+	long data_len;
+	long job_len;
 
 	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
@@ -522,10 +523,24 @@ main(int argc, char **argv)
 			&nb_max_matches, nb_qps);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
-	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
-			nb_iterations, data_file, nb_max_matches, nb_qps);
+
+	data_len = read_file(data_file, &data_buf);
+	if (data_len <= 0)
+		rte_exit(EXIT_FAILURE, "Error, can't read file, or file is empty.\n");
+
+	job_len = data_len / nb_jobs;
+	if (job_len == 0)
+		rte_exit(EXIT_FAILURE, "Error, To many jobs, for the given input.\n");
+
+	if (job_len > nb_max_payload)
+		rte_exit(EXIT_FAILURE, "Error, not enough jobs to cover input.\n");
+
+	ret = run_regex(nb_jobs, perf_mode,
+			nb_iterations, nb_max_matches, nb_qps,
+			data_buf, data_len, job_len);
 	if (ret < 0) {
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
+	rte_free(data_buf);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 4/6] app/regex: support multi cores
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
                       ` (2 preceding siblings ...)
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 3/6] app/regex: read data file once at startup Ophir Munk
@ 2020-12-20 10:41     ` Ophir Munk
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP Ophir Munk
                       ` (2 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

Up to this commit the regex application was running with multiple QPs on
a single core.  This commit adds the option to specify a number of cores
on which multiple QPs will run.
A new parameter 'nb_lcores' was added to configure the number of cores:
--nb_lcores <num of cores>.
If not configured the number of cores is set to 1 by default.  On
application startup a few initial steps occur by the main core: the
number of QPs and cores are parsed.  The QPs are distributed as evenly
as possible on the cores.  The regex device and all QPs are initialized.
The data file is read and saved in a buffer. Then for each core the
application calls rte_eal_remote_launch() with the worker routine
(run_regex) as its parameter.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c          | 155 ++++++++++++++++++++++++++++++++++++-----
 doc/guides/tools/testregex.rst |  30 ++++++--
 2 files changed, 164 insertions(+), 21 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 9bafd02..2948d3e 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -34,6 +34,7 @@ enum app_args {
 	ARG_PERF_MODE,
 	ARG_NUM_OF_ITERATIONS,
 	ARG_NUM_OF_QPS,
+	ARG_NUM_OF_LCORES,
 };
 
 struct job_ctx {
@@ -49,6 +50,26 @@ struct qp_params {
 	char *buf;
 };
 
+struct qps_per_lcore {
+	unsigned int lcore_id;
+	int socket;
+	uint16_t qp_id_base;
+	uint16_t nb_qps;
+};
+
+struct regex_conf {
+	uint32_t nb_jobs;
+	bool perf_mode;
+	uint32_t nb_iterations;
+	char *data_file;
+	uint8_t nb_max_matches;
+	uint32_t nb_qps;
+	uint16_t qp_id_base;
+	char *data_buf;
+	long data_len;
+	long job_len;
+};
+
 static void
 usage(const char *prog_name)
 {
@@ -58,14 +79,15 @@ usage(const char *prog_name)
 		" --nb_jobs: number of jobs to use\n"
 		" --perf N: only outputs the performance data\n"
 		" --nb_iter N: number of iteration to run\n"
-		" --nb_qps N: number of queues to use\n",
+		" --nb_qps N: number of queues to use\n"
+		" --nb_lcores N: number of lcores to use\n",
 		prog_name);
 }
 
 static void
 args_parse(int argc, char **argv, char *rules_file, char *data_file,
 	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations,
-	   uint32_t *nb_qps)
+	   uint32_t *nb_qps, uint32_t *nb_lcores)
 {
 	char **argvopt;
 	int opt;
@@ -85,6 +107,8 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		{ "nb_iter", 1, 0, ARG_NUM_OF_ITERATIONS},
 		/* Number of QPs. */
 		{ "nb_qps", 1, 0, ARG_NUM_OF_QPS},
+		/* Number of lcores. */
+		{ "nb_lcores", 1, 0, ARG_NUM_OF_LCORES},
 		/* End of options */
 		{ 0, 0, 0, 0 }
 	};
@@ -121,6 +145,9 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		case ARG_NUM_OF_QPS:
 			*nb_qps = atoi(optarg);
 			break;
+		case ARG_NUM_OF_LCORES:
+			*nb_lcores = atoi(optarg);
+			break;
 		case ARG_HELP:
 			usage("RegEx test app");
 			break;
@@ -274,11 +301,18 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 }
 
 static int
-run_regex(uint32_t nb_jobs,
-	  bool perf_mode, uint32_t nb_iterations,
-	  uint8_t nb_max_matches, uint32_t nb_qps,
-	  char *data_buf, long data_len, long job_len)
+run_regex(void *args)
 {
+	struct regex_conf *rgxc = args;
+	uint32_t nb_jobs = rgxc->nb_jobs;
+	uint32_t nb_iterations = rgxc->nb_iterations;
+	uint8_t nb_max_matches = rgxc->nb_max_matches;
+	uint32_t nb_qps = rgxc->nb_qps;
+	uint16_t qp_id_base  = rgxc->qp_id_base;
+	char *data_buf = rgxc->data_buf;
+	long data_len = rgxc->data_len;
+	long job_len = rgxc->job_len;
+
 	char *buf = NULL;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
@@ -298,9 +332,13 @@ run_regex(uint32_t nb_jobs,
 	struct qp_params *qps = NULL;
 	bool update;
 	uint16_t qps_used = 0;
+	char mbuf_pool[16];
 
 	shinfo.free_cb = extbuf_free_cb;
-	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs * nb_qps, 0,
+	snprintf(mbuf_pool,
+		 sizeof(mbuf_pool),
+		 "mbuf_pool_%2u", qp_id_base);
+	mbuf_mp = rte_pktmbuf_pool_create(mbuf_pool, nb_jobs * nb_qps, 0,
 			0, MBUF_SIZE, rte_socket_id());
 	if (mbuf_mp == NULL) {
 		printf("Error, can't create memory pool\n");
@@ -402,7 +440,7 @@ run_regex(uint32_t nb_jobs,
 						qp->total_enqueue +=
 						rte_regexdev_enqueue_burst
 							(dev_id,
-							qp_id,
+							qp_id_base + qp_id,
 							cur_ops_to_enqueue,
 							actual_jobs -
 							qp->total_enqueue);
@@ -418,7 +456,7 @@ run_regex(uint32_t nb_jobs,
 					qp->total_dequeue +=
 						rte_regexdev_dequeue_burst
 							(dev_id,
-							qp_id,
+							qp_id_base + qp_id,
 							cur_ops_to_dequeue,
 							qp->total_enqueue -
 							qp->total_dequeue);
@@ -435,7 +473,7 @@ run_regex(uint32_t nb_jobs,
 	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
 		1000000000.0);
 
-	if (perf_mode)
+	if (rgxc->perf_mode)
 		goto end;
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 		printf("\n############ QP id=%u ############\n", qp_id);
@@ -491,6 +529,67 @@ run_regex(uint32_t nb_jobs,
 	return res;
 }
 
+static int
+distribute_qps_to_lcores(uint32_t nb_cores, uint32_t nb_qps,
+			 struct qps_per_lcore **qpl)
+{
+	int socket;
+	unsigned lcore_id;
+	uint32_t i;
+	uint16_t min_qp_id;
+	uint16_t max_qp_id;
+	struct qps_per_lcore *qps_per_lcore;
+	uint32_t detected_lcores;
+
+	if (nb_qps < nb_cores) {
+		nb_cores = nb_qps;
+		printf("Reducing number of cores to number of QPs (%u)\n",
+		       nb_cores);
+	}
+	/* Allocate qps_per_lcore array */
+	qps_per_lcore =
+		rte_malloc(NULL, sizeof(*qps_per_lcore) * nb_cores, 0);
+	if (!qps_per_lcore)
+		rte_exit(EXIT_FAILURE, "Failed to create qps_per_lcore array\n");
+	*qpl = qps_per_lcore;
+	detected_lcores = 0;
+	min_qp_id = 0;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		if (detected_lcores >= nb_cores)
+			break;
+		qps_per_lcore[detected_lcores].lcore_id = lcore_id;
+		socket = rte_lcore_to_socket_id(lcore_id);
+		if (socket == SOCKET_ID_ANY)
+			socket = 0;
+		qps_per_lcore[detected_lcores].socket = socket;
+		qps_per_lcore[detected_lcores].qp_id_base = min_qp_id;
+		max_qp_id = min_qp_id + nb_qps / nb_cores - 1;
+		if (nb_qps % nb_cores > detected_lcores)
+			max_qp_id++;
+		qps_per_lcore[detected_lcores].nb_qps = max_qp_id -
+							min_qp_id + 1;
+		min_qp_id = max_qp_id + 1;
+		detected_lcores++;
+	}
+	if (detected_lcores != nb_cores)
+		return -1;
+
+	for (i = 0; i < detected_lcores; i++) {
+		printf("===> Core %d: allocated queues: ",
+		       qps_per_lcore[i].lcore_id);
+		min_qp_id = qps_per_lcore[i].qp_id_base;
+		max_qp_id =
+			qps_per_lcore[i].qp_id_base + qps_per_lcore[i].nb_qps;
+		while (min_qp_id < max_qp_id) {
+			printf("%u ", min_qp_id);
+			min_qp_id++;
+		}
+		printf("\n");
+	}
+	return 0;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -506,6 +605,10 @@ main(int argc, char **argv)
 	char *data_buf;
 	long data_len;
 	long job_len;
+	uint32_t nb_lcores = 1;
+	struct regex_conf *rgxc;
+	uint32_t i;
+	struct qps_per_lcore *qps_per_lcore;
 
 	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
@@ -515,10 +618,15 @@ main(int argc, char **argv)
 	argv += ret;
 	if (argc > 1)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
-				&perf_mode, &nb_iterations, &nb_qps);
+				&perf_mode, &nb_iterations, &nb_qps,
+				&nb_lcores);
 
 	if (nb_qps == 0)
 		rte_exit(EXIT_FAILURE, "Number of QPs must be greater than 0\n");
+	if (nb_lcores == 0)
+		rte_exit(EXIT_FAILURE, "Number of lcores must be greater than 0\n");
+	if (distribute_qps_to_lcores(nb_lcores, nb_qps, &qps_per_lcore) < 0)
+		rte_exit(EXIT_FAILURE, "Failed to distribute queues to lcores!\n");
 	ret = init_port(&nb_max_payload, rules_file,
 			&nb_max_matches, nb_qps);
 	if (ret < 0)
@@ -535,12 +643,27 @@ main(int argc, char **argv)
 	if (job_len > nb_max_payload)
 		rte_exit(EXIT_FAILURE, "Error, not enough jobs to cover input.\n");
 
-	ret = run_regex(nb_jobs, perf_mode,
-			nb_iterations, nb_max_matches, nb_qps,
-			data_buf, data_len, job_len);
-	if (ret < 0) {
-		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
+	rgxc = rte_malloc(NULL, sizeof(*rgxc) * nb_lcores, 0);
+	if (!rgxc)
+		rte_exit(EXIT_FAILURE, "Failed to create Regex Conf\n");
+	for (i = 0; i < nb_lcores; i++) {
+		rgxc[i] = (struct regex_conf){
+			.nb_jobs = nb_jobs,
+			.perf_mode = perf_mode,
+			.nb_iterations = nb_iterations,
+			.nb_max_matches = nb_max_matches,
+			.nb_qps = qps_per_lcore[i].nb_qps,
+			.qp_id_base = qps_per_lcore[i].qp_id_base,
+			.data_buf = data_buf,
+			.data_len = data_len,
+			.job_len = job_len,
+		};
+		rte_eal_remote_launch(run_regex, &rgxc[i],
+				      qps_per_lcore[i].lcore_id);
 	}
+	rte_eal_mp_wait_lcore();
 	rte_free(data_buf);
+	rte_free(rgxc);
+	rte_free(qps_per_lcore);
 	return EXIT_SUCCESS;
 }
diff --git a/doc/guides/tools/testregex.rst b/doc/guides/tools/testregex.rst
index 112b2bb..a59acd9 100644
--- a/doc/guides/tools/testregex.rst
+++ b/doc/guides/tools/testregex.rst
@@ -7,13 +7,28 @@ dpdk-test-regex Tool
 The ``dpdk-test-regex`` tool is a Data Plane Development Kit (DPDK)
 application that allows functional testing and performance measurement for
 the RegEx PMDs.
-The test supports only one core and one PMD.
+
 It is based on precompiled rule file, and an input file, both of them can
 be selected using command-line options.
 
 In general case, each PMD has its own rule file.
 
-The test outputs the following data:
+By default the test supports one QP per core, however a higher number of cores
+and QPs can be configured. The QPs are evenly distributed on the cores. All QPs
+are assigned the same number of segments of input file to parse.  Given n QPs
+(per core) - the enqueue/dequeue RegEx operations are interleaved as follows::
+
+ enqueue(QP #1)
+ enqueue(QP #2)
+ ...
+ enqueue(QP #n)
+ dequeue(QP #1)
+ dequeue(QP #2)
+ ...
+ dequeue(QP #n)
+
+
+The test outputs the following data per QP and core:
 
 * Performance, in gigabit per second.
 
@@ -26,8 +41,6 @@ The test outputs the following data:
 Limitations
 ~~~~~~~~~~~
 
-* Only one queue is supported.
-
 * Supports only precompiled rules.
 
 
@@ -43,6 +56,12 @@ Application Options
 ``--nb_jobs N``
   number of jobs to use
 
+``--nb_qps N``
+  number of QPs to use
+
+``--nb_lcores N``
+  number of cores to use
+
 ``--perf N``
   only outputs the performance data
 
@@ -70,4 +89,5 @@ The data file, will be used as a source data for the RegEx to work on.
 
 The tool has a number of command line options. Here is the sample command line::
 
-   ./dpdk-test-regex -a 83:00.0 -- --rules rule_file.rof2 --data data_file.txt --job 100
+   ./dpdk-test-regex -a 83:00.0 -- --rules rule_file.rof2 --data data_file.txt --job 100 \
+     --nb_qps 4 --nb_lcores 2
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
                       ` (3 preceding siblings ...)
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 4/6] app/regex: support multi cores Ophir Munk
@ 2020-12-20 10:41     ` Ophir Munk
  2021-01-08  9:08       ` Thomas Monjalon
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
  2021-01-04 14:01     ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ori Kam
  6 siblings, 1 reply; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

Up to this commit, measuring the parsing elapsed time and the gigabit-per-
second performance was done on the aggregate of all QPs (per core).
This commit separates the time measurements per individual QP.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 2948d3e..e845655 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -48,6 +48,8 @@ struct qp_params {
 	struct rte_regex_ops **ops;
 	struct job_ctx *jobs_ctx;
 	char *buf;
+	time_t start;
+	time_t end;
 };
 
 struct qps_per_lcore {
@@ -324,8 +326,6 @@ run_regex(void *args)
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
 	int res = 0;
-	time_t start;
-	time_t end;
 	double time;
 	struct rte_mempool *mbuf_mp;
 	struct qp_params *qp;
@@ -418,9 +418,10 @@ run_regex(void *args)
 
 		qp->buf = buf;
 		qp->total_matches = 0;
+		qp->start = 0;
+		qp->end = 0;
 	}
 
-	start = clock();
 	for (i = 0; i < nb_iterations; i++) {
 		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 			qp = &qps[qp_id];
@@ -431,6 +432,8 @@ run_regex(void *args)
 			update = false;
 			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 				qp = &qps[qp_id];
+				if (!qp->start)
+					qp->start = clock();
 				if (qp->total_dequeue < actual_jobs) {
 					struct rte_regex_ops **
 						cur_ops_to_enqueue = qp->ops +
@@ -461,22 +464,30 @@ run_regex(void *args)
 							qp->total_enqueue -
 							qp->total_dequeue);
 					update = true;
+				} else {
+					if (!qp->end)
+						qp->end = clock();
 				}
+
 			}
 		} while (update);
 	}
-	end = clock();
-	time = ((double)end - start) / CLOCKS_PER_SEC;
-	printf("Job len = %ld Bytes\n",  job_len);
-	printf("Time = %lf sec\n",  time);
-	printf("Perf = %lf Gbps\n",
-	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
-		1000000000.0);
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
+		printf("Core=%u QP=%u\n", rte_lcore_id(), qp_id + qp_id_base);
+		printf("Job len = %ld Bytes\n",  job_len);
+		printf("Time = %lf sec\n",  time);
+		printf("Perf = %lf Gbps\n\n",
+				(((double)actual_jobs * job_len *
+				nb_iterations * 8) / time) /
+				1000000000.0);
+	}
 
 	if (rgxc->perf_mode)
 		goto end;
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
-		printf("\n############ QP id=%u ############\n", qp_id);
+		printf("\n############ Core=%u QP=%u ############\n",
+		       rte_lcore_id(), qp_id + qp_id_base);
 		qp = &qps[qp_id];
 		/* Log results per job. */
 		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v2 6/6] app/regex: replace Linux clock() API with rdtsc
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
                       ` (4 preceding siblings ...)
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP Ophir Munk
@ 2020-12-20 10:41     ` Ophir Munk
  2021-01-04 14:01     ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ori Kam
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2020-12-20 10:41 UTC (permalink / raw)
  To: dev, Ori Kam, Ophir Munk; +Cc: Thomas Monjalon

Performance measurements (elapsed time and Gbps) are based on the Linux
clock() API. The resolution is improved by replacing the clock() API
with rte_rdtsc_precise() API.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
---
 app/test-regex/main.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index e845655..aea4fa6 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -48,8 +48,8 @@ struct qp_params {
 	struct rte_regex_ops **ops;
 	struct job_ctx *jobs_ctx;
 	char *buf;
-	time_t start;
-	time_t end;
+	uint64_t start;
+	uint64_t cycles;
 };
 
 struct qps_per_lcore {
@@ -326,7 +326,7 @@ run_regex(void *args)
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
 	int res = 0;
-	double time;
+	long double time;
 	struct rte_mempool *mbuf_mp;
 	struct qp_params *qp;
 	struct qp_params *qps = NULL;
@@ -419,7 +419,7 @@ run_regex(void *args)
 		qp->buf = buf;
 		qp->total_matches = 0;
 		qp->start = 0;
-		qp->end = 0;
+		qp->cycles = 0;
 	}
 
 	for (i = 0; i < nb_iterations; i++) {
@@ -432,9 +432,8 @@ run_regex(void *args)
 			update = false;
 			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 				qp = &qps[qp_id];
-				if (!qp->start)
-					qp->start = clock();
 				if (qp->total_dequeue < actual_jobs) {
+					qp->start = rte_rdtsc_precise();
 					struct rte_regex_ops **
 						cur_ops_to_enqueue = qp->ops +
 						qp->total_enqueue;
@@ -463,24 +462,21 @@ run_regex(void *args)
 							cur_ops_to_dequeue,
 							qp->total_enqueue -
 							qp->total_dequeue);
+					qp->cycles +=
+					     (rte_rdtsc_precise() - qp->start);
 					update = true;
-				} else {
-					if (!qp->end)
-						qp->end = clock();
 				}
-
 			}
 		} while (update);
 	}
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
-		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
-		printf("Core=%u QP=%u\n", rte_lcore_id(), qp_id + qp_id_base);
-		printf("Job len = %ld Bytes\n",  job_len);
-		printf("Time = %lf sec\n",  time);
-		printf("Perf = %lf Gbps\n\n",
-				(((double)actual_jobs * job_len *
-				nb_iterations * 8) / time) /
-				1000000000.0);
+		qp = &qps[qp_id];
+		time = (long double)qp->cycles / rte_get_timer_hz();
+		printf("Core=%u QP=%u Job=%ld Bytes Time=%Lf sec Perf=%Lf "
+		       "Gbps\n", rte_lcore_id(), qp_id + qp_id_base,
+		       job_len, time,
+		       (((double)actual_jobs * job_len * nb_iterations * 8)
+		       / time) / 1000000000.0);
 	}
 
 	if (rgxc->perf_mode)
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support
  2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
                       ` (5 preceding siblings ...)
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
@ 2021-01-04 14:01     ` Ori Kam
  6 siblings, 0 replies; 28+ messages in thread
From: Ori Kam @ 2021-01-04 14:01 UTC (permalink / raw)
  To: Ophir Munk, dev, Ophir Munk; +Cc: NBU-Contact-Thomas Monjalon



> -----Original Message-----
> From: Ophir Munk <ophirmu@nvidia.com>
> Sent: Sunday, December 20, 2020 12:42 PM
> To: dev@dpdk.org; Ori Kam <orika@nvidia.com>; Ophir Munk
> <ophirmu@nvidia.com>
> Cc: NBU-Contact-Thomas Monjalon <thomas@monjalon.net>
> Subject: [PATCH v2 0/6] regex multi Q with multi cores support
> 
> This patchset enhances the regex application to support multi Q with multi
> cores.
> 
> v1: Initial release.
> v2: Update documentation (testregex.rst).
>     Fix checkpatch misspelling errors.
> 
> Ophir Munk (6):
>   app/regex: move mem pool creation to worker routine
>   app/regex: support multi QPs
>   app/regex: read data file once at startup
>   app/regex: support multi cores
>   app/regex: support performance measurements per QP
>   app/regex: replace Linux clock() API with rdtsc
> 
>  app/test-regex/main.c          | 519 +++++++++++++++++++++++++++++-----------
> -
>  doc/guides/tools/testregex.rst |  30 ++-
>  2 files changed, 398 insertions(+), 151 deletions(-)
> 
> --
> 2.8.4

Series-acked-by:  Ori Kam <orika@nvidia.com>

Thanks,
Ori



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP Ophir Munk
@ 2021-01-08  9:08       ` Thomas Monjalon
  2021-01-10 11:16         ` Ophir Munk
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Monjalon @ 2021-01-08  9:08 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Ori Kam

20/12/2020 11:41, Ophir Munk:
> Up to this commit measuring the parsing elapsed time and Giga bits per
> second performance was done on the aggregation of all QPs (per core).
> This commit separates the time measurements per individual QP.
> 
> Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
> ---
> --- a/app/test-regex/main.c
> +++ b/app/test-regex/main.c
> +	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
> +		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;

This line triggers an error with PPC compiler:
error: ‘qp’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
   time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;




^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support
  2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
@ 2021-01-10 11:10       ` Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
                           ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

This patchset enhances the regex application to support multi Q with multi cores.

v1: Initial release.
v2: Update documentation (testregex.rst).
v3: fix an error in commit "app/regex: support performance measurements per QP"
    The following line triggered an error with PPC compiler:
    error: qp may be used uninitialized in this function [-Werror=maybe-uninitialized]
    time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
    Adding Acked-By

Ophir Munk (6):
  app/regex: move mem pool creation to worker routine
  app/regex: support multi QPs
  app/regex: read data file once at startup
  app/regex: support multi cores
  app/regex: support performance measurements per QP
  app/regex: replace Linux clock() API with rdtsc

 app/test-regex/main.c          | 519 +++++++++++++++++++++++++++++------------
 doc/guides/tools/testregex.rst |  30 ++-
 2 files changed, 398 insertions(+), 151 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 1/6] app/regex: move mem pool creation to worker routine
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
@ 2021-01-10 11:10         ` Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 2/6] app/regex: support multi QPs Ophir Munk
                           ` (5 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

Function rte_pktmbuf_pool_create() is moved from init_port() routine to
run_regex() routine. Looking ahead to multi-core support - init_port()
will be called only once as part of application startup while mem pool
creation should be called multiple times (per core).

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
---
 app/test-regex/main.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index ac6152d..cb2a065 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -163,8 +163,7 @@ read_file(char *file, char **buf)
 }
 
 static int
-init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
-	  uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
+init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 {
 	uint16_t id;
 	uint16_t num_devs;
@@ -187,14 +186,6 @@ init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
 		return -EINVAL;
 	}
 
-	*mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
-					  0, MBUF_SIZE, rte_socket_id());
-	if (*mbuf_mp == NULL) {
-		printf("Error, can't create memory pool\n");
-		res = -ENOMEM;
-		goto error;
-	}
-
 	rules_len = read_file(rules_file, &rules);
 	if (rules_len < 0) {
 		printf("Error, can't read rules files.\n");
@@ -237,8 +228,6 @@ init_port(struct rte_mempool **mbuf_mp, uint32_t nb_jobs,
 error:
 	if (rules)
 		rte_free(rules);
-	if (*mbuf_mp)
-		rte_mempool_free(*mbuf_mp);
 	return res;
 }
 
@@ -248,7 +237,7 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 }
 
 static int
-run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
+run_regex(uint32_t nb_jobs,
 	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
 	  char *data_file, uint8_t nb_max_matches)
 {
@@ -273,9 +262,17 @@ run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
 	time_t end;
 	double time;
 	struct job_ctx *jobs_ctx;
+	struct rte_mempool *mbuf_mp;
 
 	shinfo.free_cb = extbuf_free_cb;
 
+	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
+			0, MBUF_SIZE, rte_socket_id());
+	if (mbuf_mp == NULL) {
+		printf("Error, can't create memory pool\n");
+		return -ENOMEM;
+	}
+
 	ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
 	if (!ops) {
 		printf("Error, can't allocate memory for ops.\n");
@@ -409,6 +406,9 @@ run_regex(struct rte_mempool *mbuf_mp, uint32_t nb_jobs,
 	rte_free(jobs_ctx);
 	if (buf)
 		rte_free(buf);
+	if (mbuf_mp)
+		rte_mempool_free(mbuf_mp);
+
 	return res;
 }
 
@@ -417,7 +417,6 @@ main(int argc, char **argv)
 {
 	char rules_file[MAX_FILE_NAME];
 	char data_file[MAX_FILE_NAME];
-	struct rte_mempool *mbuf_mp = NULL;
 	uint32_t nb_jobs = 0;
 	uint16_t nb_max_payload = 0;
 	bool perf_mode = 0;
@@ -434,16 +433,13 @@ main(int argc, char **argv)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
 			   &perf_mode, &nb_iterations);
 
-	ret = init_port(&mbuf_mp, nb_jobs, &nb_max_payload, rules_file,
-			&nb_max_matches);
+	ret = init_port(&nb_max_payload, rules_file, &nb_max_matches);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
-	ret = run_regex(mbuf_mp, nb_jobs, nb_max_payload, perf_mode,
+	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
 			nb_iterations, data_file, nb_max_matches);
 	if (ret < 0) {
-		rte_mempool_free(mbuf_mp);
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
-	rte_mempool_free(mbuf_mp);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 2/6] app/regex: support multi QPs
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
@ 2021-01-10 11:10         ` Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 3/6] app/regex: read data file once at startup Ophir Munk
                           ` (4 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

Up to this commit the regex application used one QP which was assigned a
number of jobs, each with a different segment of a file to parse.  This
commit adds support for multiple QP assignments. All QPs will be
assigned the same number of jobs, with the same segments of the file to
parse. This enables comparing functionality with different numbers of
QPs. All queues are managed on one core with one thread. This commit
focuses on changing the routines' API to support multiple QPs; mainly,
QP scalar variables are replaced by per-QP struct instances.  The
enqueue/dequeue operations are interleaved as follows:
 enqueue(QP #1)
 enqueue(QP #2)
 ...
 enqueue(QP #n)
 dequeue(QP #1)
 dequeue(QP #2)
 ...
 dequeue(QP #n)

A new parameter 'nb_qps' was added to configure the number of QPs:
 --nb_qps <num of qps>.
If not configured, nb_qps is set to 1 by default.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
---
 app/test-regex/main.c | 322 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 204 insertions(+), 118 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index cb2a065..d225267 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -33,12 +33,22 @@ enum app_args {
 	ARG_NUM_OF_JOBS,
 	ARG_PERF_MODE,
 	ARG_NUM_OF_ITERATIONS,
+	ARG_NUM_OF_QPS,
 };
 
 struct job_ctx {
 	struct rte_mbuf *mbuf;
 };
 
+struct qp_params {
+	uint32_t total_enqueue;
+	uint32_t total_dequeue;
+	uint32_t total_matches;
+	struct rte_regex_ops **ops;
+	struct job_ctx *jobs_ctx;
+	char *buf;
+};
+
 static void
 usage(const char *prog_name)
 {
@@ -47,13 +57,15 @@ usage(const char *prog_name)
 		" --data NAME: data file to use\n"
 		" --nb_jobs: number of jobs to use\n"
 		" --perf N: only outputs the performance data\n"
-		" --nb_iter N: number of iteration to run\n",
+		" --nb_iter N: number of iteration to run\n"
+		" --nb_qps N: number of queues to use\n",
 		prog_name);
 }
 
 static void
 args_parse(int argc, char **argv, char *rules_file, char *data_file,
-	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations)
+	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations,
+	   uint32_t *nb_qps)
 {
 	char **argvopt;
 	int opt;
@@ -71,6 +83,8 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		{ "perf", 0, 0, ARG_PERF_MODE},
 		/* Number of iterations to run with perf test */
 		{ "nb_iter", 1, 0, ARG_NUM_OF_ITERATIONS},
+		/* Number of QPs. */
+		{ "nb_qps", 1, 0, ARG_NUM_OF_QPS},
 		/* End of options */
 		{ 0, 0, 0, 0 }
 	};
@@ -104,6 +118,9 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		case ARG_NUM_OF_ITERATIONS:
 			*nb_iterations = atoi(optarg);
 			break;
+		case ARG_NUM_OF_QPS:
+			*nb_qps = atoi(optarg);
+			break;
 		case ARG_HELP:
 			usage("RegEx test app");
 			break;
@@ -163,15 +180,17 @@ read_file(char *file, char **buf)
 }
 
 static int
-init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
+init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches,
+	  uint32_t nb_qps)
 {
 	uint16_t id;
+	uint16_t qp_id;
 	uint16_t num_devs;
 	char *rules = NULL;
 	long rules_len;
 	struct rte_regexdev_info info;
 	struct rte_regexdev_config dev_conf = {
-		.nb_queue_pairs = 1,
+		.nb_queue_pairs = nb_qps,
 		.nb_groups = 1,
 	};
 	struct rte_regexdev_qp_conf qp_conf = {
@@ -203,7 +222,8 @@ init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 		*nb_max_matches = info.max_matches;
 		*nb_max_payload = info.max_payload_size;
 		if (info.regexdev_capa & RTE_REGEXDEV_SUPP_MATCH_AS_END_F)
-			dev_conf.dev_cfg_flags |= RTE_REGEXDEV_CFG_MATCH_AS_END_F;
+			dev_conf.dev_cfg_flags |=
+			RTE_REGEXDEV_CFG_MATCH_AS_END_F;
 		dev_conf.nb_max_matches = info.max_matches;
 		dev_conf.nb_rules_per_group = info.max_rules_per_group;
 		dev_conf.rule_db_len = rules_len;
@@ -214,12 +234,16 @@ init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches)
 			goto error;
 		}
 		if (info.regexdev_capa & RTE_REGEXDEV_CAPA_QUEUE_PAIR_OOS_F)
-			qp_conf.qp_conf_flags |= RTE_REGEX_QUEUE_PAIR_CFG_OOS_F;
-		res = rte_regexdev_queue_pair_setup(id, 0, &qp_conf);
-		if (res < 0) {
-			printf("Error, can't setup queue pair for device %d.\n",
-			       id);
-			goto error;
+			qp_conf.qp_conf_flags |=
+			RTE_REGEX_QUEUE_PAIR_CFG_OOS_F;
+		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+			res = rte_regexdev_queue_pair_setup(id, qp_id,
+							    &qp_conf);
+			if (res < 0) {
+				printf("Error, can't setup queue pair %u for "
+				       "device %d.\n", qp_id, id);
+				goto error;
+			}
 		}
 		printf(":: initializing device: %d done\n", id);
 	}
@@ -239,122 +263,171 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 static int
 run_regex(uint32_t nb_jobs,
 	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
-	  char *data_file, uint8_t nb_max_matches)
+	  char *data_file, uint8_t nb_max_matches, uint32_t nb_qps)
 {
 	char *buf = NULL;
-	long buf_len;
-	long job_len;
+	long buf_len = 0;
+	long job_len = 0;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
-	struct rte_regex_ops **ops;
+	uint16_t qp_id;
 	uint16_t dev_id = 0;
-	uint16_t qp_id = 0;
 	uint8_t nb_matches;
 	struct rte_regexdev_match *match;
-	long pos = 0;
+	long pos;
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
-	uint32_t total_enqueue = 0;
-	uint32_t total_dequeue = 0;
-	uint32_t total_matches = 0;
 	int res = 0;
 	time_t start;
 	time_t end;
 	double time;
-	struct job_ctx *jobs_ctx;
 	struct rte_mempool *mbuf_mp;
+	struct qp_params *qp;
+	struct qp_params *qps = NULL;
+	bool update;
+	uint16_t qps_used = 0;
 
 	shinfo.free_cb = extbuf_free_cb;
-
-	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs, 0,
+	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs * nb_qps, 0,
 			0, MBUF_SIZE, rte_socket_id());
 	if (mbuf_mp == NULL) {
 		printf("Error, can't create memory pool\n");
 		return -ENOMEM;
 	}
 
-	ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
-	if (!ops) {
-		printf("Error, can't allocate memory for ops.\n");
-		return -ENOMEM;
-	}
-
-	jobs_ctx = rte_malloc(NULL, sizeof(struct job_ctx)*nb_jobs, 0);
-	if (!jobs_ctx) {
-		printf("Error, can't allocate memory for jobs_ctx.\n");
-		return -ENOMEM;
+	qps = rte_malloc(NULL, sizeof(*qps) * nb_qps, 0);
+	if (!qps) {
+		printf("Error, can't allocate memory for QPs\n");
+		res = -ENOMEM;
+		goto end;
 	}
 
-	/* Allocate the jobs and assign each job with an mbuf. */
-	for (i = 0; i < nb_jobs; i++) {
-		ops[i] = rte_malloc(NULL, sizeof(*ops[0]) + nb_max_matches *
-				    sizeof(struct rte_regexdev_match), 0);
-		if (!ops[i]) {
-			printf("Error, can't allocate memory for op.\n");
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		struct rte_regex_ops **ops;
+		struct job_ctx *jobs_ctx;
+
+		qps_used++;
+		qp = &qps[qp_id];
+		qp->jobs_ctx = NULL;
+		qp->buf = NULL;
+		qp->ops = ops = rte_malloc(NULL, sizeof(*ops) * nb_jobs, 0);
+		if (!ops) {
+			printf("Error, can't allocate memory for ops.\n");
 			res = -ENOMEM;
 			goto end;
 		}
-		ops[i]->mbuf = rte_pktmbuf_alloc(mbuf_mp);
-		if (!ops[i]->mbuf) {
-			printf("Error, can't attach mbuf.\n");
+
+		qp->jobs_ctx = jobs_ctx =
+			rte_malloc(NULL, sizeof(*jobs_ctx) * nb_jobs, 0);
+		if (!jobs_ctx) {
+			printf("Error, can't allocate memory for jobs_ctx.\n");
 			res = -ENOMEM;
 			goto end;
 		}
-	}
 
-	buf_len = read_file(data_file, &buf);
-	if (buf_len <= 0) {
-		printf("Error, can't read file, or file is empty.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		/* Allocate the jobs and assign each job with an mbuf. */
+		for (i = 0; i < nb_jobs; i++) {
+			ops[i] = rte_malloc(NULL, sizeof(*ops[0]) +
+					nb_max_matches *
+					sizeof(struct rte_regexdev_match), 0);
+			if (!ops[i]) {
+				printf("Error, can't allocate "
+				       "memory for op.\n");
+				res = -ENOMEM;
+				goto end;
+			}
+			ops[i]->mbuf = rte_pktmbuf_alloc(mbuf_mp);
+			if (!ops[i]->mbuf) {
+				printf("Error, can't attach mbuf.\n");
+				res = -ENOMEM;
+				goto end;
+			}
+		}
 
-	job_len = buf_len / nb_jobs;
-	if (job_len == 0) {
-		printf("Error, To many jobs, for the given input.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		buf_len = read_file(data_file, &buf);
+		if (buf_len <= 0) {
+			printf("Error, can't read file, or file is empty.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
 
-	if (job_len > nb_max_payload) {
-		printf("Error, not enough jobs to cover input.\n");
-		res = -EXIT_FAILURE;
-		goto end;
-	}
+		job_len = buf_len / nb_jobs;
+		if (job_len == 0) {
+			printf("Error, To many jobs, for the given input.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
+
+		if (job_len > nb_max_payload) {
+			printf("Error, not enough jobs to cover input.\n");
+			res = -EXIT_FAILURE;
+			goto end;
+		}
 
-	/* Assign each mbuf with the data to handle. */
-	for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
-		long act_job_len = RTE_MIN(job_len, buf_len - pos);
-		rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
-					  act_job_len, &shinfo);
-		jobs_ctx[i].mbuf = ops[i]->mbuf;
-		ops[i]->mbuf->data_len = job_len;
-		ops[i]->mbuf->pkt_len = act_job_len;
-		ops[i]->user_id = i;
-		ops[i]->group_id0 = 1;
-		pos += act_job_len;
-		actual_jobs++;
+		/* Assign each mbuf with the data to handle. */
+		actual_jobs = 0;
+		pos = 0;
+		for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
+			long act_job_len = RTE_MIN(job_len, buf_len - pos);
+			rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
+					act_job_len, &shinfo);
+			jobs_ctx[i].mbuf = ops[i]->mbuf;
+			ops[i]->mbuf->data_len = job_len;
+			ops[i]->mbuf->pkt_len = act_job_len;
+			ops[i]->user_id = i;
+			ops[i]->group_id0 = 1;
+			pos += act_job_len;
+			actual_jobs++;
+		}
+
+		qp->buf = buf;
+		qp->total_matches = 0;
 	}
 
 	start = clock();
 	for (i = 0; i < nb_iterations; i++) {
-		total_enqueue = 0;
-		total_dequeue = 0;
-		while (total_dequeue < actual_jobs) {
-			struct rte_regex_ops **cur_ops_to_enqueue = ops +
-				total_enqueue;
-			struct rte_regex_ops **cur_ops_to_dequeue = ops +
-				total_dequeue;
-
-			if (actual_jobs - total_enqueue)
-				total_enqueue += rte_regexdev_enqueue_burst
-					(dev_id, qp_id, cur_ops_to_enqueue,
-					 actual_jobs - total_enqueue);
-
-			total_dequeue += rte_regexdev_dequeue_burst
-				(dev_id, qp_id, cur_ops_to_dequeue,
-				 total_enqueue - total_dequeue);
+		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+			qp = &qps[qp_id];
+			qp->total_enqueue = 0;
+			qp->total_dequeue = 0;
 		}
+		do {
+			update = false;
+			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+				qp = &qps[qp_id];
+				if (qp->total_dequeue < actual_jobs) {
+					struct rte_regex_ops **
+						cur_ops_to_enqueue = qp->ops +
+						qp->total_enqueue;
+
+					if (actual_jobs - qp->total_enqueue)
+						qp->total_enqueue +=
+						rte_regexdev_enqueue_burst
+							(dev_id,
+							qp_id,
+							cur_ops_to_enqueue,
+							actual_jobs -
+							qp->total_enqueue);
+				}
+			}
+			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+				qp = &qps[qp_id];
+				if (qp->total_dequeue < actual_jobs) {
+					struct rte_regex_ops **
+						cur_ops_to_dequeue = qp->ops +
+						qp->total_dequeue;
+
+					qp->total_dequeue +=
+						rte_regexdev_dequeue_burst
+							(dev_id,
+							qp_id,
+							cur_ops_to_dequeue,
+							qp->total_enqueue -
+							qp->total_dequeue);
+					update = true;
+				}
+			}
+		} while (update);
 	}
 	end = clock();
 	time = ((double)end - start) / CLOCKS_PER_SEC;
@@ -364,51 +437,59 @@ run_regex(uint32_t nb_jobs,
 	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
 		1000000000.0);
 
-	if (!perf_mode) {
+	if (perf_mode)
+		goto end;
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		printf("\n############ QP id=%u ############\n", qp_id);
+		qp = &qps[qp_id];
 		/* Log results per job. */
-		for (d_ind = 0; d_ind < total_dequeue; d_ind++) {
-			nb_matches = ops[d_ind % actual_jobs]->nb_matches;
+		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
+			nb_matches = qp->ops[d_ind % actual_jobs]->nb_matches;
 			printf("Job id %"PRIu64" number of matches = %d\n",
-			       ops[d_ind]->user_id, nb_matches);
-			total_matches += nb_matches;
-			match = ops[d_ind % actual_jobs]->matches;
+					qp->ops[d_ind]->user_id, nb_matches);
+			qp->total_matches += nb_matches;
+			match = qp->ops[d_ind % actual_jobs]->matches;
 			for (i = 0; i < nb_matches; i++) {
-				printf("match %d, rule = %d, start = %d,len = %d\n",
+				printf("match %d, rule = %d, "
+				       "start = %d,len = %d\n",
 				       i, match->rule_id, match->start_offset,
 				       match->len);
 				match++;
 			}
 		}
-		printf("Total matches = %d\n", total_matches);
+		printf("Total matches = %d\n", qp->total_matches);
 		printf("All Matches:\n");
-
 		/* Log absolute results. */
-		for (d_ind = 0; d_ind < total_dequeue; d_ind++) {
-			nb_matches = ops[d_ind % actual_jobs]->nb_matches;
-			total_matches += nb_matches;
-			match = ops[d_ind % actual_jobs]->matches;
+		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
+			nb_matches = qp->ops[d_ind % actual_jobs]->nb_matches;
+			qp->total_matches += nb_matches;
+			match = qp->ops[d_ind % actual_jobs]->matches;
 			for (i = 0; i < nb_matches; i++) {
 				printf("start = %ld, len = %d, rule = %d\n",
-				       match->start_offset + d_ind * job_len,
-				       match->len, match->rule_id);
+						match->start_offset +
+						d_ind * job_len,
+						match->len, match->rule_id);
 				match++;
 			}
 		}
 	}
 end:
-	for (i = 0; i < actual_jobs; i++) {
-		if (ops[i])
-			rte_free(ops[i]);
-		if (jobs_ctx[i].mbuf)
-			rte_pktmbuf_free(jobs_ctx[i].mbuf);
+	for (qp_id = 0; qp_id < qps_used; qp_id++) {
+		qp = &qps[qp_id];
+		for (i = 0; i < actual_jobs && qp->ops; i++)
+			rte_free(qp->ops[i]);
+		rte_free(qp->ops);
+		qp->ops = NULL;
+		for (i = 0; i < actual_jobs && qp->jobs_ctx; i++)
+			rte_pktmbuf_free(qp->jobs_ctx[i].mbuf);
+		rte_free(qp->jobs_ctx);
+		qp->jobs_ctx = NULL;
+		rte_free(qp->buf);
+		qp->buf = NULL;
 	}
-	rte_free(ops);
-	rte_free(jobs_ctx);
-	if (buf)
-		rte_free(buf);
 	if (mbuf_mp)
 		rte_mempool_free(mbuf_mp);
-
+	rte_free(qps);
 	return res;
 }
 
@@ -418,12 +499,14 @@ main(int argc, char **argv)
 	char rules_file[MAX_FILE_NAME];
 	char data_file[MAX_FILE_NAME];
 	uint32_t nb_jobs = 0;
-	uint16_t nb_max_payload = 0;
 	bool perf_mode = 0;
 	uint32_t nb_iterations = 0;
-	uint8_t nb_max_matches = 0;
 	int ret;
+	uint16_t nb_max_payload = 0;
+	uint8_t nb_max_matches = 0;
+	uint32_t nb_qps = 1;
 
+	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "EAL init failed\n");
@@ -431,13 +514,16 @@ main(int argc, char **argv)
 	argv += ret;
 	if (argc > 1)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
-			   &perf_mode, &nb_iterations);
+				&perf_mode, &nb_iterations, &nb_qps);
 
-	ret = init_port(&nb_max_payload, rules_file, &nb_max_matches);
+	if (nb_qps == 0)
+		rte_exit(EXIT_FAILURE, "Number of QPs must be greater than 0\n");
+	ret = init_port(&nb_max_payload, rules_file,
+			&nb_max_matches, nb_qps);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
 	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
-			nb_iterations, data_file, nb_max_matches);
+			nb_iterations, data_file, nb_max_matches, nb_qps);
 	if (ret < 0) {
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 3/6] app/regex: read data file once at startup
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 2/6] app/regex: support multi QPs Ophir Munk
@ 2021-01-10 11:10         ` Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 4/6] app/regex: support multi cores Ophir Munk
                           ` (3 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

Up to this commit the input data file was read from scratch for each QP,
which is redundant. Starting from this commit, the data file is read
only once at startup, and each QP clones the data.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
---
 app/test-regex/main.c | 63 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index d225267..9bafd02 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -180,6 +180,19 @@ read_file(char *file, char **buf)
 }
 
 static int
+clone_buf(char *data_buf, char **buf, long data_len)
+{
+	char *dest_buf;
+	dest_buf =
+		rte_malloc(NULL, sizeof(char) * (data_len + 1), 4096);
+	if (!dest_buf)
+		return -ENOMEM;
+	memcpy(dest_buf, data_buf, data_len + 1);
+	*buf = dest_buf;
+	return 0;
+}
+
+static int
 init_port(uint16_t *nb_max_payload, char *rules_file, uint8_t *nb_max_matches,
 	  uint32_t nb_qps)
 {
@@ -262,12 +275,11 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 
 static int
 run_regex(uint32_t nb_jobs,
-	  uint16_t nb_max_payload, bool perf_mode, uint32_t nb_iterations,
-	  char *data_file, uint8_t nb_max_matches, uint32_t nb_qps)
+	  bool perf_mode, uint32_t nb_iterations,
+	  uint8_t nb_max_matches, uint32_t nb_qps,
+	  char *data_buf, long data_len, long job_len)
 {
 	char *buf = NULL;
-	long buf_len = 0;
-	long job_len = 0;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
 	uint16_t qp_id;
@@ -344,22 +356,8 @@ run_regex(uint32_t nb_jobs,
 			}
 		}
 
-		buf_len = read_file(data_file, &buf);
-		if (buf_len <= 0) {
-			printf("Error, can't read file, or file is empty.\n");
-			res = -EXIT_FAILURE;
-			goto end;
-		}
-
-		job_len = buf_len / nb_jobs;
-		if (job_len == 0) {
-			printf("Error, To many jobs, for the given input.\n");
-			res = -EXIT_FAILURE;
-			goto end;
-		}
-
-		if (job_len > nb_max_payload) {
-			printf("Error, not enough jobs to cover input.\n");
+		if (clone_buf(data_buf, &buf, data_len)) {
+			printf("Error, can't clone buf.\n");
 			res = -EXIT_FAILURE;
 			goto end;
 		}
@@ -367,8 +365,8 @@ run_regex(uint32_t nb_jobs,
 		/* Assign each mbuf with the data to handle. */
 		actual_jobs = 0;
 		pos = 0;
-		for (i = 0; (pos < buf_len) && (i < nb_jobs) ; i++) {
-			long act_job_len = RTE_MIN(job_len, buf_len - pos);
+		for (i = 0; (pos < data_len) && (i < nb_jobs) ; i++) {
+			long act_job_len = RTE_MIN(job_len, data_len - pos);
 			rte_pktmbuf_attach_extbuf(ops[i]->mbuf, &buf[pos], 0,
 					act_job_len, &shinfo);
 			jobs_ctx[i].mbuf = ops[i]->mbuf;
@@ -505,6 +503,9 @@ main(int argc, char **argv)
 	uint16_t nb_max_payload = 0;
 	uint8_t nb_max_matches = 0;
 	uint32_t nb_qps = 1;
+	char *data_buf;
+	long data_len;
+	long job_len;
 
 	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
@@ -522,10 +523,24 @@ main(int argc, char **argv)
 			&nb_max_matches, nb_qps);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init port failed\n");
-	ret = run_regex(nb_jobs, nb_max_payload, perf_mode,
-			nb_iterations, data_file, nb_max_matches, nb_qps);
+
+	data_len = read_file(data_file, &data_buf);
+	if (data_len <= 0)
+		rte_exit(EXIT_FAILURE, "Error, can't read file, or file is empty.\n");
+
+	job_len = data_len / nb_jobs;
+	if (job_len == 0)
+		rte_exit(EXIT_FAILURE, "Error, To many jobs, for the given input.\n");
+
+	if (job_len > nb_max_payload)
+		rte_exit(EXIT_FAILURE, "Error, not enough jobs to cover input.\n");
+
+	ret = run_regex(nb_jobs, perf_mode,
+			nb_iterations, nb_max_matches, nb_qps,
+			data_buf, data_len, job_len);
 	if (ret < 0) {
 		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
 	}
+	rte_free(data_buf);
 	return EXIT_SUCCESS;
 }
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 4/6] app/regex: support multi cores
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
                           ` (2 preceding siblings ...)
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 3/6] app/regex: read data file once at startup Ophir Munk
@ 2021-01-10 11:10         ` Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 5/6] app/regex: support performance measurements per QP Ophir Munk
                           ` (2 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

Up to this commit the regex application was running with multiple QPs on
a single core.  This commit adds the option to specify a number of cores
on which multiple QPs will run.
A new parameter 'nb_lcores' was added to configure the number of cores:
--nb_lcores <num of cores>.
If not configured, the number of cores is set to 1 by default.  On
application startup a few initial steps are performed by the main core:
the number of QPs and cores is parsed.  The QPs are distributed as evenly
as possible on the cores.  The regex device and all QPs are initialized.
The data file is read and saved in a buffer. Then for each core the
application calls rte_eal_remote_launch() with the worker routine
(run_regex) as its parameter.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
---
 app/test-regex/main.c          | 155 ++++++++++++++++++++++++++++++++++++-----
 doc/guides/tools/testregex.rst |  30 ++++++--
 2 files changed, 164 insertions(+), 21 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 9bafd02..2948d3e 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -34,6 +34,7 @@ enum app_args {
 	ARG_PERF_MODE,
 	ARG_NUM_OF_ITERATIONS,
 	ARG_NUM_OF_QPS,
+	ARG_NUM_OF_LCORES,
 };
 
 struct job_ctx {
@@ -49,6 +50,26 @@ struct qp_params {
 	char *buf;
 };
 
+struct qps_per_lcore {
+	unsigned int lcore_id;
+	int socket;
+	uint16_t qp_id_base;
+	uint16_t nb_qps;
+};
+
+struct regex_conf {
+	uint32_t nb_jobs;
+	bool perf_mode;
+	uint32_t nb_iterations;
+	char *data_file;
+	uint8_t nb_max_matches;
+	uint32_t nb_qps;
+	uint16_t qp_id_base;
+	char *data_buf;
+	long data_len;
+	long job_len;
+};
+
 static void
 usage(const char *prog_name)
 {
@@ -58,14 +79,15 @@ usage(const char *prog_name)
 		" --nb_jobs: number of jobs to use\n"
 		" --perf N: only outputs the performance data\n"
 		" --nb_iter N: number of iteration to run\n"
-		" --nb_qps N: number of queues to use\n",
+		" --nb_qps N: number of queues to use\n"
+		" --nb_lcores N: number of lcores to use\n",
 		prog_name);
 }
 
 static void
 args_parse(int argc, char **argv, char *rules_file, char *data_file,
 	   uint32_t *nb_jobs, bool *perf_mode, uint32_t *nb_iterations,
-	   uint32_t *nb_qps)
+	   uint32_t *nb_qps, uint32_t *nb_lcores)
 {
 	char **argvopt;
 	int opt;
@@ -85,6 +107,8 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		{ "nb_iter", 1, 0, ARG_NUM_OF_ITERATIONS},
 		/* Number of QPs. */
 		{ "nb_qps", 1, 0, ARG_NUM_OF_QPS},
+		/* Number of lcores. */
+		{ "nb_lcores", 1, 0, ARG_NUM_OF_LCORES},
 		/* End of options */
 		{ 0, 0, 0, 0 }
 	};
@@ -121,6 +145,9 @@ args_parse(int argc, char **argv, char *rules_file, char *data_file,
 		case ARG_NUM_OF_QPS:
 			*nb_qps = atoi(optarg);
 			break;
+		case ARG_NUM_OF_LCORES:
+			*nb_lcores = atoi(optarg);
+			break;
 		case ARG_HELP:
 			usage("RegEx test app");
 			break;
@@ -274,11 +301,18 @@ extbuf_free_cb(void *addr __rte_unused, void *fcb_opaque __rte_unused)
 }
 
 static int
-run_regex(uint32_t nb_jobs,
-	  bool perf_mode, uint32_t nb_iterations,
-	  uint8_t nb_max_matches, uint32_t nb_qps,
-	  char *data_buf, long data_len, long job_len)
+run_regex(void *args)
 {
+	struct regex_conf *rgxc = args;
+	uint32_t nb_jobs = rgxc->nb_jobs;
+	uint32_t nb_iterations = rgxc->nb_iterations;
+	uint8_t nb_max_matches = rgxc->nb_max_matches;
+	uint32_t nb_qps = rgxc->nb_qps;
+	uint16_t qp_id_base  = rgxc->qp_id_base;
+	char *data_buf = rgxc->data_buf;
+	long data_len = rgxc->data_len;
+	long job_len = rgxc->job_len;
+
 	char *buf = NULL;
 	uint32_t actual_jobs = 0;
 	uint32_t i;
@@ -298,9 +332,13 @@ run_regex(uint32_t nb_jobs,
 	struct qp_params *qps = NULL;
 	bool update;
 	uint16_t qps_used = 0;
+	char mbuf_pool[16];
 
 	shinfo.free_cb = extbuf_free_cb;
-	mbuf_mp = rte_pktmbuf_pool_create("mbuf_pool", nb_jobs * nb_qps, 0,
+	snprintf(mbuf_pool,
+		 sizeof(mbuf_pool),
+		 "mbuf_pool_%2u", qp_id_base);
+	mbuf_mp = rte_pktmbuf_pool_create(mbuf_pool, nb_jobs * nb_qps, 0,
 			0, MBUF_SIZE, rte_socket_id());
 	if (mbuf_mp == NULL) {
 		printf("Error, can't create memory pool\n");
@@ -402,7 +440,7 @@ run_regex(uint32_t nb_jobs,
 						qp->total_enqueue +=
 						rte_regexdev_enqueue_burst
 							(dev_id,
-							qp_id,
+							qp_id_base + qp_id,
 							cur_ops_to_enqueue,
 							actual_jobs -
 							qp->total_enqueue);
@@ -418,7 +456,7 @@ run_regex(uint32_t nb_jobs,
 					qp->total_dequeue +=
 						rte_regexdev_dequeue_burst
 							(dev_id,
-							qp_id,
+							qp_id_base + qp_id,
 							cur_ops_to_dequeue,
 							qp->total_enqueue -
 							qp->total_dequeue);
@@ -435,7 +473,7 @@ run_regex(uint32_t nb_jobs,
 	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
 		1000000000.0);
 
-	if (perf_mode)
+	if (rgxc->perf_mode)
 		goto end;
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 		printf("\n############ QP id=%u ############\n", qp_id);
@@ -491,6 +529,67 @@ run_regex(uint32_t nb_jobs,
 	return res;
 }
 
+static int
+distribute_qps_to_lcores(uint32_t nb_cores, uint32_t nb_qps,
+			 struct qps_per_lcore **qpl)
+{
+	int socket;
+	unsigned lcore_id;
+	uint32_t i;
+	uint16_t min_qp_id;
+	uint16_t max_qp_id;
+	struct qps_per_lcore *qps_per_lcore;
+	uint32_t detected_lcores;
+
+	if (nb_qps < nb_cores) {
+		nb_cores = nb_qps;
+		printf("Reducing number of cores to number of QPs (%u)\n",
+		       nb_cores);
+	}
+	/* Allocate qps_per_lcore array */
+	qps_per_lcore =
+		rte_malloc(NULL, sizeof(*qps_per_lcore) * nb_cores, 0);
+	if (!qps_per_lcore)
+		rte_exit(EXIT_FAILURE, "Failed to create qps_per_lcore array\n");
+	*qpl = qps_per_lcore;
+	detected_lcores = 0;
+	min_qp_id = 0;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		if (detected_lcores >= nb_cores)
+			break;
+		qps_per_lcore[detected_lcores].lcore_id = lcore_id;
+		socket = rte_lcore_to_socket_id(lcore_id);
+		if (socket == SOCKET_ID_ANY)
+			socket = 0;
+		qps_per_lcore[detected_lcores].socket = socket;
+		qps_per_lcore[detected_lcores].qp_id_base = min_qp_id;
+		max_qp_id = min_qp_id + nb_qps / nb_cores - 1;
+		if (nb_qps % nb_cores > detected_lcores)
+			max_qp_id++;
+		qps_per_lcore[detected_lcores].nb_qps = max_qp_id -
+							min_qp_id + 1;
+		min_qp_id = max_qp_id + 1;
+		detected_lcores++;
+	}
+	if (detected_lcores != nb_cores)
+		return -1;
+
+	for (i = 0; i < detected_lcores; i++) {
+		printf("===> Core %d: allocated queues: ",
+		       qps_per_lcore[i].lcore_id);
+		min_qp_id = qps_per_lcore[i].qp_id_base;
+		max_qp_id =
+			qps_per_lcore[i].qp_id_base + qps_per_lcore[i].nb_qps;
+		while (min_qp_id < max_qp_id) {
+			printf("%u ", min_qp_id);
+			min_qp_id++;
+		}
+		printf("\n");
+	}
+	return 0;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -506,6 +605,10 @@ main(int argc, char **argv)
 	char *data_buf;
 	long data_len;
 	long job_len;
+	uint32_t nb_lcores = 1;
+	struct regex_conf *rgxc;
+	uint32_t i;
+	struct qps_per_lcore *qps_per_lcore;
 
 	/* Init EAL. */
 	ret = rte_eal_init(argc, argv);
@@ -515,10 +618,15 @@ main(int argc, char **argv)
 	argv += ret;
 	if (argc > 1)
 		args_parse(argc, argv, rules_file, data_file, &nb_jobs,
-				&perf_mode, &nb_iterations, &nb_qps);
+				&perf_mode, &nb_iterations, &nb_qps,
+				&nb_lcores);
 
 	if (nb_qps == 0)
 		rte_exit(EXIT_FAILURE, "Number of QPs must be greater than 0\n");
+	if (nb_lcores == 0)
+		rte_exit(EXIT_FAILURE, "Number of lcores must be greater than 0\n");
+	if (distribute_qps_to_lcores(nb_lcores, nb_qps, &qps_per_lcore) < 0)
+		rte_exit(EXIT_FAILURE, "Failed to distribute queues to lcores!\n");
 	ret = init_port(&nb_max_payload, rules_file,
 			&nb_max_matches, nb_qps);
 	if (ret < 0)
@@ -535,12 +643,27 @@ main(int argc, char **argv)
 	if (job_len > nb_max_payload)
 		rte_exit(EXIT_FAILURE, "Error, not enough jobs to cover input.\n");
 
-	ret = run_regex(nb_jobs, perf_mode,
-			nb_iterations, nb_max_matches, nb_qps,
-			data_buf, data_len, job_len);
-	if (ret < 0) {
-		rte_exit(EXIT_FAILURE, "RegEx function failed\n");
+	rgxc = rte_malloc(NULL, sizeof(*rgxc) * nb_lcores, 0);
+	if (!rgxc)
+		rte_exit(EXIT_FAILURE, "Failed to create Regex Conf\n");
+	for (i = 0; i < nb_lcores; i++) {
+		rgxc[i] = (struct regex_conf){
+			.nb_jobs = nb_jobs,
+			.perf_mode = perf_mode,
+			.nb_iterations = nb_iterations,
+			.nb_max_matches = nb_max_matches,
+			.nb_qps = qps_per_lcore[i].nb_qps,
+			.qp_id_base = qps_per_lcore[i].qp_id_base,
+			.data_buf = data_buf,
+			.data_len = data_len,
+			.job_len = job_len,
+		};
+		rte_eal_remote_launch(run_regex, &rgxc[i],
+				      qps_per_lcore[i].lcore_id);
 	}
+	rte_eal_mp_wait_lcore();
 	rte_free(data_buf);
+	rte_free(rgxc);
+	rte_free(qps_per_lcore);
 	return EXIT_SUCCESS;
 }
diff --git a/doc/guides/tools/testregex.rst b/doc/guides/tools/testregex.rst
index 112b2bb..a59acd9 100644
--- a/doc/guides/tools/testregex.rst
+++ b/doc/guides/tools/testregex.rst
@@ -7,13 +7,28 @@ dpdk-test-regex Tool
 The ``dpdk-test-regex`` tool is a Data Plane Development Kit (DPDK)
 application that allows functional testing and performance measurement for
 the RegEx PMDs.
-The test supports only one core and one PMD.
+
 It is based on precompiled rule file, and an input file, both of them can
 be selected using command-line options.
 
 In general case, each PMD has its own rule file.
 
-The test outputs the following data:
+By default the test supports one QP per core, however a higher number of cores
+and QPs can be configured. The QPs are evenly distributed on the cores. All QPs
+are assigned the same number of segments of input file to parse.  Given n QPs
+(per core) - the enqueue/dequeue RegEx operations are interleaved as follows::
+
+ enqueue(QP #1)
+ enqueue(QP #2)
+ ...
+ enqueue(QP #n)
+ dequeue(QP #1)
+ dequeue(QP #2)
+ ...
+ dequeue(QP #n)
+
+
+The test outputs the following data per QP and core:
 
 * Performance, in gigabit per second.
 
@@ -26,8 +41,6 @@ The test outputs the following data:
 Limitations
 ~~~~~~~~~~~
 
-* Only one queue is supported.
-
 * Supports only precompiled rules.
 
 
@@ -43,6 +56,12 @@ Application Options
 ``--nb_jobs N``
   number of jobs to use
 
+``--nb_qps N``
+  number of QPs to use
+
+``--nb_lcores N``
+  number of cores to use
+
 ``--perf N``
   only outputs the performance data
 
@@ -70,4 +89,5 @@ The data file, will be used as a source data for the RegEx to work on.
 
 The tool has a number of command line options. Here is the sample command line::
 
-   ./dpdk-test-regex -a 83:00.0 -- --rules rule_file.rof2 --data data_file.txt --job 100
+   ./dpdk-test-regex -a 83:00.0 -- --rules rule_file.rof2 --data data_file.txt --job 100 \
+     --nb_qps 4 --nb_lcores 2
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 5/6] app/regex: support performance measurements per QP
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
                           ` (3 preceding siblings ...)
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 4/6] app/regex: support multi cores Ophir Munk
@ 2021-01-10 11:10         ` Ophir Munk
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
  2021-01-12 23:05         ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Thomas Monjalon
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

Up to this commit measuring the parsing elapsed time and Giga bits per
second performance was done on the aggregation of all QPs (per core).
This commit separates the time measurements per individual QP.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
---
 app/test-regex/main.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 2948d3e..2fce55d 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -48,6 +48,8 @@ struct qp_params {
 	struct rte_regex_ops **ops;
 	struct job_ctx *jobs_ctx;
 	char *buf;
+	time_t start;
+	time_t end;
 };
 
 struct qps_per_lcore {
@@ -324,8 +326,6 @@ run_regex(void *args)
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
 	int res = 0;
-	time_t start;
-	time_t end;
 	double time;
 	struct rte_mempool *mbuf_mp;
 	struct qp_params *qp;
@@ -418,9 +418,10 @@ run_regex(void *args)
 
 		qp->buf = buf;
 		qp->total_matches = 0;
+		qp->start = 0;
+		qp->end = 0;
 	}
 
-	start = clock();
 	for (i = 0; i < nb_iterations; i++) {
 		for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 			qp = &qps[qp_id];
@@ -431,6 +432,8 @@ run_regex(void *args)
 			update = false;
 			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 				qp = &qps[qp_id];
+				if (!qp->start)
+					qp->start = clock();
 				if (qp->total_dequeue < actual_jobs) {
 					struct rte_regex_ops **
 						cur_ops_to_enqueue = qp->ops +
@@ -461,22 +464,31 @@ run_regex(void *args)
 							qp->total_enqueue -
 							qp->total_dequeue);
 					update = true;
+				} else {
+					if (!qp->end)
+						qp->end = clock();
 				}
+
 			}
 		} while (update);
 	}
-	end = clock();
-	time = ((double)end - start) / CLOCKS_PER_SEC;
-	printf("Job len = %ld Bytes\n",  job_len);
-	printf("Time = %lf sec\n",  time);
-	printf("Perf = %lf Gbps\n",
-	       (((double)actual_jobs * job_len * nb_iterations * 8) / time) /
-		1000000000.0);
+	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
+		qp = &qps[qp_id];
+		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
+		printf("Core=%u QP=%u\n", rte_lcore_id(), qp_id + qp_id_base);
+		printf("Job len = %ld Bytes\n",  job_len);
+		printf("Time = %lf sec\n",  time);
+		printf("Perf = %lf Gbps\n\n",
+				(((double)actual_jobs * job_len *
+				nb_iterations * 8) / time) /
+				1000000000.0);
+	}
 
 	if (rgxc->perf_mode)
 		goto end;
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
-		printf("\n############ QP id=%u ############\n", qp_id);
+		printf("\n############ Core=%u QP=%u ############\n",
+		       rte_lcore_id(), qp_id + qp_id_base);
 		qp = &qps[qp_id];
 		/* Log results per job. */
 		for (d_ind = 0; d_ind < qp->total_dequeue; d_ind++) {
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [dpdk-dev] [PATCH v3 6/6] app/regex: replace Linux clock() API with rdtsc
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
                           ` (4 preceding siblings ...)
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 5/6] app/regex: support performance measurements per QP Ophir Munk
@ 2021-01-10 11:10         ` Ophir Munk
  2021-01-12 23:05         ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Thomas Monjalon
  6 siblings, 0 replies; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:10 UTC (permalink / raw)
  To: dev; +Cc: Ori Kam, Thomas Monjalon, Ophir Munk

Performance measurement (elapsed time and Gbps) are based on Linux
clock() API. The resolution is improved by replacing the clock() API
with rte_rdtsc_precise() API.

Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
---
 app/test-regex/main.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/app/test-regex/main.c b/app/test-regex/main.c
index 2fce55d..aea4fa6 100644
--- a/app/test-regex/main.c
+++ b/app/test-regex/main.c
@@ -48,8 +48,8 @@ struct qp_params {
 	struct rte_regex_ops **ops;
 	struct job_ctx *jobs_ctx;
 	char *buf;
-	time_t start;
-	time_t end;
+	uint64_t start;
+	uint64_t cycles;
 };
 
 struct qps_per_lcore {
@@ -326,7 +326,7 @@ run_regex(void *args)
 	unsigned long d_ind = 0;
 	struct rte_mbuf_ext_shared_info shinfo;
 	int res = 0;
-	double time;
+	long double time;
 	struct rte_mempool *mbuf_mp;
 	struct qp_params *qp;
 	struct qp_params *qps = NULL;
@@ -419,7 +419,7 @@ run_regex(void *args)
 		qp->buf = buf;
 		qp->total_matches = 0;
 		qp->start = 0;
-		qp->end = 0;
+		qp->cycles = 0;
 	}
 
 	for (i = 0; i < nb_iterations; i++) {
@@ -432,9 +432,8 @@ run_regex(void *args)
 			update = false;
 			for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 				qp = &qps[qp_id];
-				if (!qp->start)
-					qp->start = clock();
 				if (qp->total_dequeue < actual_jobs) {
+					qp->start = rte_rdtsc_precise();
 					struct rte_regex_ops **
 						cur_ops_to_enqueue = qp->ops +
 						qp->total_enqueue;
@@ -463,25 +462,21 @@ run_regex(void *args)
 							cur_ops_to_dequeue,
 							qp->total_enqueue -
 							qp->total_dequeue);
+					qp->cycles +=
+					     (rte_rdtsc_precise() - qp->start);
 					update = true;
-				} else {
-					if (!qp->end)
-						qp->end = clock();
 				}
-
 			}
 		} while (update);
 	}
 	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
 		qp = &qps[qp_id];
-		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
-		printf("Core=%u QP=%u\n", rte_lcore_id(), qp_id + qp_id_base);
-		printf("Job len = %ld Bytes\n",  job_len);
-		printf("Time = %lf sec\n",  time);
-		printf("Perf = %lf Gbps\n\n",
-				(((double)actual_jobs * job_len *
-				nb_iterations * 8) / time) /
-				1000000000.0);
+		time = (long double)qp->cycles / rte_get_timer_hz();
+		printf("Core=%u QP=%u Job=%ld Bytes Time=%Lf sec Perf=%Lf "
+		       "Gbps\n", rte_lcore_id(), qp_id + qp_id_base,
+		       job_len, time,
+		       (((double)actual_jobs * job_len * nb_iterations * 8)
+		       / time) / 1000000000.0);
 	}
 
 	if (rgxc->perf_mode)
-- 
2.8.4


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP
  2021-01-08  9:08       ` Thomas Monjalon
@ 2021-01-10 11:16         ` Ophir Munk
  2021-01-10 22:55           ` Thomas Monjalon
  0 siblings, 1 reply; 28+ messages in thread
From: Ophir Munk @ 2021-01-10 11:16 UTC (permalink / raw)
  To: NBU-Contact-Thomas Monjalon; +Cc: dev, Ori Kam



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Friday, January 8, 2021 11:09 AM
> To: Ophir Munk <ophirmu@nvidia.com>
> Cc: dev@dpdk.org; Ori Kam <orika@nvidia.com>
> Subject: Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance
> measurements per QP
> 
> 20/12/2020 11:41, Ophir Munk:
> > Up to this commit measuring the parsing elapsed time and Giga bits per
> > second performance was done on the aggregation of all QPs (per core).
> > This commit separates the time measurements per individual QP.
> >
> > Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
> > ---
> > --- a/app/test-regex/main.c
> > +++ b/app/test-regex/main.c
> > +	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
> > +		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
> 
> This line triggers an error with PPC compiler:
> error: ‘qp’ may be used uninitialized in this function [-Werror=maybe-
> uninitialized]
>    time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
> 
Thanks for reporting.
I sent v3 with a fix.
Could I have known this error in advance? Is there a CI report?



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP
  2021-01-10 11:16         ` Ophir Munk
@ 2021-01-10 22:55           ` Thomas Monjalon
  2021-01-11 19:07             ` David Christensen
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Monjalon @ 2021-01-10 22:55 UTC (permalink / raw)
  To: Ophir Munk; +Cc: dev, Ori Kam, David Christensen

10/01/2021 12:16, Ophir Munk:
> 
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Friday, January 8, 2021 11:09 AM
> > To: Ophir Munk <ophirmu@nvidia.com>
> > Cc: dev@dpdk.org; Ori Kam <orika@nvidia.com>
> > Subject: Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance
> > measurements per QP
> > 
> > 20/12/2020 11:41, Ophir Munk:
> > > Up to this commit measuring the parsing elapsed time and Giga bits per
> > > second performance was done on the aggregation of all QPs (per core).
> > > This commit separates the time measurements per individual QP.
> > >
> > > Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
> > > ---
> > > --- a/app/test-regex/main.c
> > > +++ b/app/test-regex/main.c
> > > +	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
> > > +		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
> > 
> > This line triggers an error with PPC compiler:
> > error: ‘qp’ may be used uninitialized in this function [-Werror=maybe-
> > uninitialized]
> >    time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
> > 
> Thanks for reporting.
> I sent v3 with a fix.
> Could I have known this error in advance? Is there a CI report?

No I think there is no CI report for PPC.
Actually the PPC support in DPDK is not clear.



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP
  2021-01-10 22:55           ` Thomas Monjalon
@ 2021-01-11 19:07             ` David Christensen
  0 siblings, 0 replies; 28+ messages in thread
From: David Christensen @ 2021-01-11 19:07 UTC (permalink / raw)
  To: Thomas Monjalon, Ophir Munk; +Cc: dev, Ori Kam

>>> -----Original Message-----
>>> From: Thomas Monjalon <thomas@monjalon.net>
>>> Sent: Friday, January 8, 2021 11:09 AM
>>> To: Ophir Munk <ophirmu@nvidia.com>
>>> Cc: dev@dpdk.org; Ori Kam <orika@nvidia.com>
>>> Subject: Re: [dpdk-dev] [PATCH v2 5/6] app/regex: support performance
>>> measurements per QP
>>>
>>> 20/12/2020 11:41, Ophir Munk:
>>>> Up to this commit measuring the parsing elapsed time and Giga bits per
>>>> second performance was done on the aggregation of all QPs (per core).
>>>> This commit separates the time measurements per individual QP.
>>>>
>>>> Signed-off-by: Ophir Munk <ophirmu@nvidia.com>
>>>> ---
>>>> --- a/app/test-regex/main.c
>>>> +++ b/app/test-regex/main.c
>>>> +	for (qp_id = 0; qp_id < nb_qps; qp_id++) {
>>>> +		time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
>>>
>>> This line triggers an error with PPC compiler:
>>> error: ‘qp’ may be used uninitialized in this function [-Werror=maybe-
>>> uninitialized]
>>>     time = ((double)qp->end - qp->start) / CLOCKS_PER_SEC;
>>>
>> Thanks for reporting.
>> I sent v3 with a fix.
>> Could I have known this error in advance? Is there a CI report?
> 
> No I think there is no CI report for PPC.
> Actually the PPC support in DPDK is not clear.

IBM is still supporting DPDK on PPC, though we don't have a proper CI 
infrastructure in place.  PPC support is available from Travis but was 
unreliable when we last attempted to enable it.  I'll take Thomas' 
hint and revisit the issue in 2021.

Dave

FYI, v2 of the patch built successfully on my PPC test system with gcc 
8.3.1.  There's a compiler difference between my RH8 environment and the 
cross-build environment Thomas references.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support
  2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
                           ` (5 preceding siblings ...)
  2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
@ 2021-01-12 23:05         ` Thomas Monjalon
  6 siblings, 0 replies; 28+ messages in thread
From: Thomas Monjalon @ 2021-01-12 23:05 UTC (permalink / raw)
  To: Ophir Munk; +Cc: Ori Kam, dev

> Ophir Munk (6):
>   app/regex: move mem pool creation to worker routine
>   app/regex: support multi QPs
>   app/regex: read data file once at startup
>   app/regex: support multi cores
>   app/regex: support performance measurements per QP
>   app/regex: replace Linux clock() API with rdtsc

Applied, thanks.



^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-01-12 23:05 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-16 16:49 [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ophir Munk
2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
2020-12-20 10:41   ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ophir Munk
2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
2021-01-10 11:10       ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Ophir Munk
2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 1/6] app/regex: move mem pool creation to worker routine Ophir Munk
2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 2/6] app/regex: support multi QPs Ophir Munk
2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 3/6] app/regex: read data file once at startup Ophir Munk
2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 4/6] app/regex: support multi cores Ophir Munk
2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 5/6] app/regex: support performance measurements per QP Ophir Munk
2021-01-10 11:10         ` [dpdk-dev] [PATCH v3 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
2021-01-12 23:05         ` [dpdk-dev] [PATCH v3 0/6] regex multi Q with multi cores support Thomas Monjalon
2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 2/6] app/regex: support multi QPs Ophir Munk
2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 3/6] app/regex: read data file once at startup Ophir Munk
2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 4/6] app/regex: support multi cores Ophir Munk
2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 5/6] app/regex: support performance measurements per QP Ophir Munk
2021-01-08  9:08       ` Thomas Monjalon
2021-01-10 11:16         ` Ophir Munk
2021-01-10 22:55           ` Thomas Monjalon
2021-01-11 19:07             ` David Christensen
2020-12-20 10:41     ` [dpdk-dev] [PATCH v2 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
2021-01-04 14:01     ` [dpdk-dev] [PATCH v2 0/6] regex multi Q with multi cores support Ori Kam
2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 2/6] app/regex: support multi QPs Ophir Munk
2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 3/6] app/regex: read data file once at startup Ophir Munk
2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 4/6] app/regex: support multi cores Ophir Munk
2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 5/6] app/regex: support performance measurements per QP Ophir Munk
2020-12-16 16:49 ` [dpdk-dev] [PATCH v1 6/6] app/regex: replace Linux clock() API with rdtsc Ophir Munk
2020-12-17 11:52 ` [dpdk-dev] [PATCH v1 0/6] regex multi Q with multi cores support Ori Kam

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git