* [PATCH v2] app/dma-perf: add SG copy support
@ 2023-08-10 10:57 Gowrishankar Muthukrishnan
  2023-08-10 13:01 ` [PATCH v3 0/2] " Gowrishankar Muthukrishnan
  0 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-08-10 10:57 UTC (permalink / raw)
  To: dev; +Cc: anoobj, Cheng Jiang, Gowrishankar Muthukrishnan
Add scatter-gather (SG) copy support to the dma-perf application.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
v2:
 - patch issue fixed.
---
 app/test-dma-perf/benchmark.c | 204 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  17 +++
 app/test-dma-perf/main.c      |  35 +++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 231 insertions(+), 30 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9e5b5dc770..5f03f99b7b 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge **src_sges;
+	struct rte_dma_sge **dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -130,7 +144,7 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -153,6 +167,10 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -166,8 +184,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -177,7 +199,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, ring_size, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -217,7 +239,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -270,6 +292,61 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge **src_sges = para->src_sges;
+	struct rte_dma_sge **dst_sges = para->dst_sges;
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0, src_sges[i], dst_sges[i],
+								  src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -303,8 +380,9 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge ***src_sges, struct rte_dma_sge ***dst_sges)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets, i;
@@ -366,15 +444,69 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+		uint32_t nr_buf = cfg->nr_buf;
+		uint8_t j;
+
+		*src_sges = rte_malloc(NULL, nr_buf * sizeof(struct rte_dma_sge **), 0);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i] = rte_malloc(NULL, src_ptrs * sizeof(struct rte_dma_sge), 0);
+			if ((*src_sges)[i] == NULL) {
+				printf("Error: src_sges malloc failed.\n");
+				return -1;
+			}
+		}
+
+		*dst_sges = rte_malloc(NULL, nr_buf * sizeof(struct rte_dma_sge **), 0);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		for (i = 0; i < nr_buf; i++) {
+			(*dst_sges)[i] = rte_malloc(NULL, dst_ptrs * sizeof(struct rte_dma_sge), 0);
+			if ((*dst_sges)[i] == NULL) {
+				printf("Error: dst_sges malloc failed.\n");
+				return -1;
+			}
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+		for (i = 0; i < nr_buf; i++) {
+			for (j = 0; j < src_ptrs; j++) {
+				(*src_sges)[i][j].addr = rte_pktmbuf_iova((*srcs)[i]) +
+										sglen_src * j;
+				(*src_sges)[i][j].length = sglen_src;
+			}
+			(*src_sges)[i][j-1].length += buf_size % src_ptrs;
+
+			for (j = 0; j < dst_ptrs; j++) {
+				(*dst_sges)[i][j].addr = rte_pktmbuf_iova((*dsts)[i]) +
+										sglen_dst * j;
+				(*dst_sges)[i][j].length = sglen_dst;
+			}
+			(*dst_sges)[i][j-1].length += buf_size % dst_ptrs;
+		}
+	}
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
 	uint16_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
+	struct rte_dma_sge **src_sges = NULL, **dst_sges = NULL;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
@@ -389,10 +521,10 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	float bandwidth, bandwidth_total;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -412,7 +544,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -426,10 +558,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + offset * cfg->src_ptrs;
+			lcores[i]->dst_sges = dst_sges + offset * cfg->dst_ptrs;
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -478,10 +623,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -510,13 +653,24 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	for (i = 0; i < nr_buf; i++) {
+		rte_free(src_sges[i]);
+		rte_free(dst_sges[i]);
+	}
+
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..f1b268a384 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -50,6 +52,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index f917be4216..f1779a166b 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -93,10 +93,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -325,7 +323,8 @@ load_configs(const char *path)
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -361,12 +360,14 @@ load_configs(const char *path)
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
 			test_case->test_type_str = CPU_MEM_COPY;
 			is_dma = false;
+
 		} else {
 			printf("Error: Wrong test case type %s in case%d.\n", case_type, i + 1);
 			test_case->is_valid = false;
 			continue;
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -401,6 +402,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 658f22f673..b240bb5497 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -47,11 +47,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -59,6 +62,6 @@ struct test_configure {
 	uint8_t scenario_id;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v3 0/2] app/dma-perf: add SG copy support
  2023-08-10 10:57 [PATCH v2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-08-10 13:01 ` Gowrishankar Muthukrishnan
  2023-08-10 13:01   ` [PATCH v3 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                     ` (3 more replies)
  0 siblings, 4 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-08-10 13:01 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Gowrishankar Muthukrishnan
Add scatter-gather (SG) copy support to the dma-perf application.
v3:
 - Combined the copy-validation patch with this patch into one series,
   which gives better validation for SG copies.
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 227 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  17 +++
 app/test-dma-perf/main.c      |  47 +++++--
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 261 insertions(+), 35 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v3 1/2] app/dma-perf: validate copied memory
  2023-08-10 13:01 ` [PATCH v3 0/2] " Gowrishankar Muthukrishnan
@ 2023-08-10 13:01   ` Gowrishankar Muthukrishnan
  2023-08-23 11:46     ` [EXT] " Pavan Nikhilesh Bhagavatula
  2023-08-10 13:01   ` [PATCH v3 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-08-10 13:01 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Fixes: 623dc9364dc ("app/dma-perf: introduce DMA performance test")
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
 app/test-dma-perf/benchmark.c | 23 +++++++++++++++++++++--
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0601e0d171..9e5b5dc770 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -306,7 +307,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
-	unsigned int nr_sockets;
+	unsigned int nr_sockets, i;
 	uint32_t nr_buf = cfg->nr_buf;
 
 	nr_sockets = rte_socket_count();
@@ -360,10 +361,15 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint16_t i;
@@ -381,6 +387,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -454,6 +461,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < cfg->nr_buf; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -505,4 +522,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e5bccc27da..f917be4216 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -86,20 +86,24 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -144,8 +148,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index f65e264378..658f22f673 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -59,6 +59,6 @@ struct test_configure {
 	uint8_t scenario_id;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v3 2/2] app/dma-perf: add SG copy support
  2023-08-10 13:01 ` [PATCH v3 0/2] " Gowrishankar Muthukrishnan
  2023-08-10 13:01   ` [PATCH v3 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-08-10 13:01   ` Gowrishankar Muthukrishnan
  2023-09-21  3:02   ` [PATCH v3 0/2] " Jiang, Cheng1
  2023-09-24  9:32   ` [PATCH v4 " Gowrishankar Muthukrishnan
  3 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-08-10 13:01 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Gowrishankar Muthukrishnan
Add scatter-gather (SG) copy support to the dma-perf application.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
 app/test-dma-perf/benchmark.c | 204 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  17 +++
 app/test-dma-perf/main.c      |  35 +++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 231 insertions(+), 30 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9e5b5dc770..5f03f99b7b 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge **src_sges;
+	struct rte_dma_sge **dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -130,7 +144,7 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -153,6 +167,10 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -166,8 +184,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -177,7 +199,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, ring_size, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -217,7 +239,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -270,6 +292,61 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge **src_sges = para->src_sges;
+	struct rte_dma_sge **dst_sges = para->dst_sges;
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0, src_sges[i], dst_sges[i],
+								  src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -303,8 +380,9 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge ***src_sges, struct rte_dma_sge ***dst_sges)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets, i;
@@ -366,15 +444,69 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+		uint32_t nr_buf = cfg->nr_buf;
+		uint8_t j;
+
+		*src_sges = rte_malloc(NULL, nr_buf * sizeof(struct rte_dma_sge **), 0);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i] = rte_malloc(NULL, src_ptrs * sizeof(struct rte_dma_sge), 0);
+			if ((*src_sges)[i] == NULL) {
+				printf("Error: src_sges malloc failed.\n");
+				return -1;
+			}
+		}
+
+		*dst_sges = rte_malloc(NULL, nr_buf * sizeof(struct rte_dma_sge **), 0);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		for (i = 0; i < nr_buf; i++) {
+			(*dst_sges)[i] = rte_malloc(NULL, dst_ptrs * sizeof(struct rte_dma_sge), 0);
+			if ((*dst_sges)[i] == NULL) {
+				printf("Error: dst_sges malloc failed.\n");
+				return -1;
+			}
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+		for (i = 0; i < nr_buf; i++) {
+			for (j = 0; j < src_ptrs; j++) {
+				(*src_sges)[i][j].addr = rte_pktmbuf_iova((*srcs)[i]) +
+										sglen_src * j;
+				(*src_sges)[i][j].length = sglen_src;
+			}
+			(*src_sges)[i][j-1].length += buf_size % src_ptrs;
+
+			for (j = 0; j < dst_ptrs; j++) {
+				(*dst_sges)[i][j].addr = rte_pktmbuf_iova((*dsts)[i]) +
+										sglen_dst * j;
+				(*dst_sges)[i][j].length = sglen_dst;
+			}
+			(*dst_sges)[i][j-1].length += buf_size % dst_ptrs;
+		}
+	}
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
 	uint16_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
+	struct rte_dma_sge **src_sges = NULL, **dst_sges = NULL;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
@@ -389,10 +521,10 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	float bandwidth, bandwidth_total;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -412,7 +544,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -426,10 +558,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + offset * cfg->src_ptrs;
+			lcores[i]->dst_sges = dst_sges + offset * cfg->dst_ptrs;
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -478,10 +623,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -510,13 +653,24 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	for (i = 0; i < nr_buf; i++) {
+		rte_free(src_sges[i]);
+		rte_free(dst_sges[i]);
+	}
+
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..f1b268a384 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -50,6 +52,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index f917be4216..f1779a166b 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -93,10 +93,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -325,7 +323,8 @@ load_configs(const char *path)
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -361,12 +360,14 @@ load_configs(const char *path)
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
 			test_case->test_type_str = CPU_MEM_COPY;
 			is_dma = false;
+
 		} else {
 			printf("Error: Wrong test case type %s in case%d.\n", case_type, i + 1);
 			test_case->is_valid = false;
 			continue;
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -401,6 +402,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 658f22f673..b240bb5497 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -47,11 +47,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -59,6 +62,6 @@ struct test_configure {
 	uint8_t scenario_id;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXT] [PATCH v3 1/2] app/dma-perf: validate copied memory
  2023-08-10 13:01   ` [PATCH v3 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-08-23 11:46     ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 79+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-08-23 11:46 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Gowrishankar Muthukrishnan
> Validate copied memory to ensure DMA copy did not fail.
> 
> Fixes: 623dc9364dc ("app/dma-perf: introduce DMA performance test")
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 23 +++++++++++++++++++++--
>  app/test-dma-perf/main.c      | 16 +++++++++++-----
>  app/test-dma-perf/main.h      |  2 +-
>  3 files changed, 33 insertions(+), 8 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> perf/benchmark.c
> index 0601e0d171..9e5b5dc770 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -12,6 +12,7 @@
>  #include <rte_dmadev.h>
>  #include <rte_malloc.h>
>  #include <rte_lcore.h>
> +#include <rte_random.h>
> 
>  #include "main.h"
> 
> @@ -306,7 +307,7 @@ setup_memory_env(struct test_configure *cfg,
> struct rte_mbuf ***srcs,
>  			struct rte_mbuf ***dsts)
>  {
>  	unsigned int buf_size = cfg->buf_size.cur;
> -	unsigned int nr_sockets;
> +	unsigned int nr_sockets, i;
>  	uint32_t nr_buf = cfg->nr_buf;
> 
>  	nr_sockets = rte_socket_count();
> @@ -360,10 +361,15 @@ setup_memory_env(struct test_configure *cfg,
> struct rte_mbuf ***srcs,
>  		return -1;
>  	}
> 
> +	for (i = 0; i < nr_buf; i++) {
> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
> buf_size);
> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> +	}
> +
>  	return 0;
>  }
> 
> -void
> +int
>  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  {
>  	uint16_t i;
> @@ -381,6 +387,7 @@ mem_copy_benchmark(struct test_configure *cfg,
> bool is_dma)
>  	uint32_t avg_cycles_total;
>  	float mops, mops_total;
>  	float bandwidth, bandwidth_total;
> +	int ret = 0;
> 
>  	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
>  		goto out;
> @@ -454,6 +461,16 @@ mem_copy_benchmark(struct test_configure *cfg,
> bool is_dma)
> 
>  	rte_eal_mp_wait_lcore();
> 
> +	for (i = 0; i < cfg->nr_buf; i++) {
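Here `i` is a uint16_t, which will overflow when mem_size is set to 10 as in the default config.ini
(with buf_size=64 that gives nr_buf = 10 * 1024 * 1024 / (64 * 2) = 81920, which exceeds 65535).
Please widen it to a 32-bit type.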
> +		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> +			   rte_pktmbuf_mtod(dsts[i], void *),
> +			   cfg->buf_size.cur) != 0) {
> +			printf("Copy validation fails for buffer number %d\n",
> i);
> +			ret = -1;
> +			goto out;
> +		}
> +	}
> +
>  	mops_total = 0;
>  	bandwidth_total = 0;
>  	avg_cycles_total = 0;
> @@ -505,4 +522,6 @@ mem_copy_benchmark(struct test_configure *cfg,
> bool is_dma)
>  			rte_dma_stop(ldm->dma_ids[i]);
>  		}
>  	}
> +
> +	return ret;
>  }
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index e5bccc27da..f917be4216 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -86,20 +86,24 @@ output_header(uint32_t case_id, struct
> test_configure *case_cfg)
>  	output_csv(true);
>  }
> 
> -static void
> +static int
>  run_test_case(struct test_configure *case_cfg)
>  {
> +	int ret = 0;
> +
>  	switch (case_cfg->test_type) {
>  	case TEST_TYPE_DMA_MEM_COPY:
> -		mem_copy_benchmark(case_cfg, true);
> +		ret = mem_copy_benchmark(case_cfg, true);
>  		break;
>  	case TEST_TYPE_CPU_MEM_COPY:
> -		mem_copy_benchmark(case_cfg, false);
> +		ret = mem_copy_benchmark(case_cfg, false);
>  		break;
>  	default:
>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
>  		break;
>  	}
> +
> +	return ret;
>  }
> 
>  static void
> @@ -144,8 +148,10 @@ run_test(uint32_t case_id, struct test_configure
> *case_cfg)
>  		case_cfg->scenario_id++;
>  		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
> 
> -		run_test_case(case_cfg);
> -		output_csv(false);
> +		if (run_test_case(case_cfg) < 0)
> +			printf("\nTest fails! skipping this scenario.\n");
> +		else
> +			output_csv(false);
> 
>  		if (var_entry->op == OP_ADD)
>  			var_entry->cur += var_entry->incr;
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index f65e264378..658f22f673 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -59,6 +59,6 @@ struct test_configure {
>  	uint8_t scenario_id;
>  };
> 
> -void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> +int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> 
>  #endif /* MAIN_H */
> --
> 2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v3 0/2] app/dma-perf: add SG copy support
  2023-08-10 13:01 ` [PATCH v3 0/2] " Gowrishankar Muthukrishnan
  2023-08-10 13:01   ` [PATCH v3 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
  2023-08-10 13:01   ` [PATCH v3 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-09-21  3:02   ` Jiang, Cheng1
  2023-09-24  9:32   ` [PATCH v4 " Gowrishankar Muthukrishnan
  3 siblings, 0 replies; 79+ messages in thread
From: Jiang, Cheng1 @ 2023-09-21  3:02 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev; +Cc: anoobj, Laatz, Kevin, Richardson, Bruce
Hi,
Sorry for the late reply.
Generally I like your idea. But I might not be able to review in a timely manner, as I'm leaving Intel.
If you're in a hurry, you might need to find someone else to help with the review.
Thanks for your work,
Cheng
> -----Original Message-----
> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Sent: Thursday, August 10, 2023 9:02 PM
> To: dev@dpdk.org
> Cc: anoobj@marvell.com; Jiang, Cheng1 <cheng1.jiang@intel.com>; Laatz,
> Kevin <kevin.laatz@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>
> Subject: [PATCH v3 0/2] app/dma-perf: add SG copy support
> 
> Add SG copy support in dma-perf application.
> 
> v3:
>  - Combined patch that does copy validation along with
>    this patch, which means better validation for SG.
> 
> Gowrishankar Muthukrishnan (2):
>   app/dma-perf: validate copied memory
>   app/dma-perf: add SG copy support
> 
>  app/test-dma-perf/benchmark.c | 227
> ++++++++++++++++++++++++++++++----
>  app/test-dma-perf/config.ini  |  17 +++
>  app/test-dma-perf/main.c      |  47 +++++--
>  app/test-dma-perf/main.h      |   5 +-
>  4 files changed, 261 insertions(+), 35 deletions(-)
> 
> --
> 2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v4 0/2] app/dma-perf: add SG copy support
  2023-08-10 13:01 ` [PATCH v3 0/2] " Gowrishankar Muthukrishnan
                     ` (2 preceding siblings ...)
  2023-09-21  3:02   ` [PATCH v3 0/2] " Jiang, Cheng1
@ 2023-09-24  9:32   ` Gowrishankar Muthukrishnan
  2023-09-24  9:32     ` [PATCH v4 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                       ` (2 more replies)
  3 siblings, 3 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-09-24  9:32 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
Add SG copy support in dma-perf application.
v4:
 - improvement in buffer allocation for perf validation.
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 251 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  17 +++
 app/test-dma-perf/main.c      |  46 ++++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 283 insertions(+), 36 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v4 1/2] app/dma-perf: validate copied memory
  2023-09-24  9:32   ` [PATCH v4 " Gowrishankar Muthukrishnan
@ 2023-09-24  9:32     ` Gowrishankar Muthukrishnan
  2023-09-24  9:32     ` [PATCH v4 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
  2 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-09-24  9:32 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Fixes: 623dc9364dc ("app/dma-perf: introduce DMA performance test")
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
 app/test-dma-perf/benchmark.c | 24 ++++++++++++++++++++++--
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0601e0d171..99e05436b9 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -308,6 +309,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -360,13 +362,18 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL;
@@ -381,6 +388,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -454,6 +462,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -505,4 +523,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e5bccc27da..f917be4216 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -86,20 +86,24 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -144,8 +148,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index f65e264378..658f22f673 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -59,6 +59,6 @@ struct test_configure {
 	uint8_t scenario_id;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v4 2/2] app/dma-perf: add SG copy support
  2023-09-24  9:32   ` [PATCH v4 " Gowrishankar Muthukrishnan
  2023-09-24  9:32     ` [PATCH v4 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-09-24  9:32     ` Gowrishankar Muthukrishnan
  2023-09-28 21:12       ` Pavan Nikhilesh Bhagavatula
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
  2 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-09-24  9:32 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
Add SG copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
 app/test-dma-perf/benchmark.c | 243 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  17 +++
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 260 insertions(+), 39 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 99e05436b9..82eb2cebe9 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -130,7 +144,7 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -153,6 +167,10 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -166,8 +184,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -177,7 +199,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, ring_size, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -217,7 +239,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -270,6 +292,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf = nr_buf / src_ptrs;
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -303,8 +384,9 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
@@ -367,20 +449,50 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -388,12 +500,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf = (nr_buf / nb_workers) * nb_workers;
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf = (nr_buf / cfg->src_ptrs) * cfg->src_ptrs;
+		nr_buf *= nb_workers;
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -413,7 +536,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -427,10 +550,24 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + offset;
+			lcores[i]->dst_sges = dst_sges +
+				((nr_buf / cfg->src_ptrs * cfg->dst_ptrs) / nb_workers) * i;
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -462,13 +599,46 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else {
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf;
+
+		for (i = 0; i < (nr_buf / cfg->src_ptrs); i++) {
+			sbuf = src;
+			dbuf = dst;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				memcpy(sbuf,
+					rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *),
+					src_sz);
+				sbuf += src_sz;
+			}
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				memcpy(dbuf,
+					rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *),
+					dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -479,10 +649,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -511,13 +679,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..cd36b1f288 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -49,6 +51,21 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
+[case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
 [case2]
 type=CPU_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index f917be4216..ece896193b 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -93,10 +93,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -325,7 +323,8 @@ load_configs(const char *path)
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -367,6 +366,7 @@ load_configs(const char *path)
 			continue;
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -401,6 +401,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 658f22f673..b240bb5497 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -47,11 +47,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -59,6 +62,6 @@ struct test_configure {
 	uint8_t scenario_id;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v4 2/2] app/dma-perf: add SG copy support
  2023-09-24  9:32     ` [PATCH v4 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-09-28 21:12       ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 79+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-09-28 21:12 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Gowrishankar Muthukrishnan
> -----Original Message-----
> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Sent: Sunday, September 24, 2023 3:02 PM
> To: dev@dpdk.org
> Cc: Anoob Joseph <anoobj@marvell.com>; Cheng Jiang
> <cheng1.jiang@intel.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>
> Subject: [PATCH v4 2/2] app/dma-perf: add SG copy support
> 
> Add SG copy support.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 243
> +++++++++++++++++++++++++++++-----
>  app/test-dma-perf/config.ini  |  17 +++
>  app/test-dma-perf/main.c      |  34 ++++-
>  app/test-dma-perf/main.h      |   5 +-
>  4 files changed, 260 insertions(+), 39 deletions(-)
> 
<snip>
> @@ -462,13 +599,46 @@ mem_copy_benchmark(struct test_configure *cfg,
> bool is_dma)
> 
>  	rte_eal_mp_wait_lcore();
> 
> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> -			   rte_pktmbuf_mtod(dsts[i], void *),
> -			   cfg->buf_size.cur) != 0) {
> -			printf("Copy validation fails for buffer number %d\n",
> i);
> -			ret = -1;
> -			goto out;
> +	if (!cfg->is_sg) {
> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> +					rte_pktmbuf_mtod(dsts[i], void *),
> +					cfg->buf_size.cur) != 0) {
> +				printf("Copy validation fails for buffer
> number %d\n", i);
> +				ret = -1;
> +				goto out;
> +			}
> +		}
> +	} else {
> +		size_t src_sz = buf_size / cfg->src_ptrs;
> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> +		uint8_t src[buf_size], dst[buf_size];
> +		uint8_t *sbuf, *dbuf;
> +
> +		for (i = 0; i < (nr_buf / cfg->src_ptrs); i++) {
> +			sbuf = src;
> +			dbuf = dst;
> +
> +			for (j = 0; j < cfg->src_ptrs; j++) {
> +				memcpy(sbuf,
> +					rte_pktmbuf_mtod(srcs[i * cfg-
> >src_ptrs + j], uint8_t *),
> +					src_sz);
> +				sbuf += src_sz;
> +			}
> +
> +			for (j = 0; j < cfg->dst_ptrs; j++) {
> +				memcpy(dbuf,
> +					rte_pktmbuf_mtod(dsts[i * cfg-
> >dst_ptrs + j], uint8_t *),
> +					dst_sz);
> +				dbuf += dst_sz;
> +			}
> +
> +			if (memcmp(src, dst, buf_size) != 0) {
buf_size should be normalized to the actual buffer size sent to the DMA engine,
i.e., either cfg->src_ptrs * src_sz or cfg->dst_ptrs * dst_sz.
For example, with 15 source segments, 15 destination segments and a 64B buf_size, the actual
buffer size copied by the DMA engine would be 60 bytes (64B / 15 = 4B per segment).
> +				printf("SG Copy validation fails for buffer
> number %d\n",
> +					i * cfg->src_ptrs);
> +				ret = -1;
> +				goto out;
> +			}
>  		}
>  	}
> 
<snip>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v5 0/4] app/dma-perf: PCI Dev and SG copy support
  2023-09-24  9:32   ` [PATCH v4 " Gowrishankar Muthukrishnan
  2023-09-24  9:32     ` [PATCH v4 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
  2023-09-24  9:32     ` [PATCH v4 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-10-26 18:31     ` Gowrishankar Muthukrishnan
  2023-10-26 18:31       ` [PATCH v5 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
                         ` (4 more replies)
  2 siblings, 5 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-10-26 18:31 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Improve the dma-perf application to support PCI devices and SG copy,
along with the following additions:
 - validate copied memory
 - skip test cases that the user opts out of.
v5:
 - Additional patches included to apply as one series.
Amit Prakash Shukla (2):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 344 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  56 ++++++
 app/test-dma-perf/main.c      | 136 +++++++++++++-
 app/test-dma-perf/main.h      |  12 +-
 4 files changed, 507 insertions(+), 41 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v5 1/4] app/dma-perf: add skip support
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
@ 2023-10-26 18:31       ` Gowrishankar Muthukrishnan
  2023-11-10  9:03         ` Anoob Joseph
  2023-10-26 18:31       ` [PATCH v5 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
                         ` (3 subsequent siblings)
  4 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-10-26 18:31 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
---
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 23 +++++++++++++++++++++++
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 26 insertions(+)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..4d59234b2a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -36,6 +36,8 @@
 ; If you do not specify a result file, one will be generated with the same name as the configuration
 ; file, with the addition of "_result.csv" at the end.
 
+; "skip" To skip a test-case set skip to 1.
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e5bccc27da..61260fa072 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -320,6 +320,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -339,6 +340,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -523,6 +531,21 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
 		if (!test_cases[i].is_valid) {
 			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index f65e264378..be89cb2b65 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -41,6 +41,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v5 2/4] app/dma-perf: add PCI device support
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
  2023-10-26 18:31       ` [PATCH v5 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2023-10-26 18:31       ` Gowrishankar Muthukrishnan
  2023-11-10  9:04         ` Anoob Joseph
  2023-10-26 18:31       ` [PATCH v5 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                         ` (2 subsequent siblings)
  4 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-10-26 18:31 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to test performance for "device to memory" and
"memory to device" data transfer.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
---
 app/test-dma-perf/benchmark.c | 67 +++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  | 37 +++++++++++++++++++
 app/test-dma-perf/main.c      | 67 +++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  6 ++++
 4 files changed, 170 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0601e0d171..523f2fbb5a 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->dcoreid;
+		qconf->dst_port.pcie.vfid = cfg->vfid;
+		qconf->dst_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->scoreid;
+		qconf->src_port.pcie.vfid = cfg->vfid;
+		qconf->src_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -308,6 +344,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -360,6 +397,22 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_mbuf_iova_set(*srcs[i], (rte_iova_t)cfg->raddr);
+			((*srcs)[i])->data_off = 0;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_mbuf_iova_set(*dsts[i], (rte_iova_t)cfg->raddr);
+			((*dsts)[i])->data_off = 0;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 4d59234b2a..cddcf93c6e 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,23 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
+; ==================================================================================
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    0 - mem to mem transfer
+;    1 - mem to dev transfer
+;    2 - dev to mem transfer
+; If not specified the default value is 0 (mem to mem transfer).
+
+; "raddr" remote iova address for "mem to dev" and "dev to mem" transfer.
+
+; "scoreid" denotes source PCIe core index.
+; "dcoreid" denotes destination PCIe core index.
+; "pfid" denotes PF-id to be used for data transfer
+; "vfid" denotes VF-id of PF-id to be used for data transfer.
+
+; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
@@ -52,6 +69,26 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=2
+raddr=0x200000000
+scoreid=0
+dcoreid=0
+pfid=0
+vfid=0
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 61260fa072..9640356592 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,7 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
 
 #include "main.h"
 
@@ -318,9 +319,11 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -358,6 +361,20 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else
+				test_case->transfer_dir = (uint8_t)atoi(transfer_dir);
+
+			if (test_case->transfer_dir >= RTE_DMA_DIR_DEV_TO_DEV) {
+				printf("Error: Invalid transfer direction configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -369,6 +386,56 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			char *endptr;
+
+			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
+			if (raddr == NULL) {
+				printf("Error: No raddr configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->raddr = strtoull(raddr, &endptr, 16);
+
+			vfid = rte_cfgfile_get_entry(cfgfile, section_name, "vfid");
+			if (vfid == NULL) {
+				printf("Error: No vfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->vfid = (uint16_t)atoi(vfid);
+
+			pfid = rte_cfgfile_get_entry(cfgfile, section_name, "pfid");
+			if (pfid == NULL) {
+				printf("Error: No pfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->pfid = (uint8_t)atoi(pfid);
+
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
+			if (scoreid == NULL) {
+				printf("Error: No scoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->scoreid = (uint8_t)atoi(scoreid);
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
+			if (dcoreid == NULL) {
+				printf("Error: No dcoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->dcoreid = (uint8_t)atoi(dcoreid);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index be89cb2b65..617f62f085 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -43,6 +43,7 @@ struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -58,6 +59,11 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	uint8_t scoreid;
+	uint8_t dcoreid;
+	uint8_t pfid;
+	uint16_t vfid;
+	uint64_t raddr;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v5 3/4] app/dma-perf: validate copied memory
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
  2023-10-26 18:31       ` [PATCH v5 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
  2023-10-26 18:31       ` [PATCH v5 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2023-10-26 18:31       ` Gowrishankar Muthukrishnan
  2023-11-10  9:05         ` Anoob Joseph
  2023-10-26 18:31       ` [PATCH v5 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  4 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-10-26 18:31 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
 app/test-dma-perf/benchmark.c | 23 +++++++++++++++++++++--
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 33 insertions(+), 8 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 523f2fbb5a..c31f1aba93 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -397,6 +398,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
 		for (i = 0; i < nr_buf; i++) {
 			/* Using mbuf structure to hold remote iova address. */
@@ -416,10 +422,10 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL;
@@ -434,6 +440,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -507,6 +514,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -558,4 +575,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 9640356592..3b79694137 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -87,20 +87,24 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -145,8 +149,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 617f62f085..3d75edd1de 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -66,6 +66,6 @@ struct test_configure {
 	uint64_t raddr;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v5 4/4] app/dma-perf: add SG copy support
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
                         ` (2 preceding siblings ...)
  2023-10-26 18:31       ` [PATCH v5 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-10-26 18:31       ` Gowrishankar Muthukrishnan
  2023-11-10  9:07         ` Anoob Joseph
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  4 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-10-26 18:31 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Add scatter-gather (SG) copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
---
 app/test-dma-perf/benchmark.c | 274 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  19 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 292 insertions(+), 40 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index c31f1aba93..b363d28f15 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -339,8 +420,9 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
@@ -419,20 +501,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -440,12 +558,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -459,13 +597,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -479,10 +627,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -514,13 +675,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -531,10 +732,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -563,13 +762,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index cddcf93c6e..f460b93414 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -69,6 +71,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=2
@@ -88,7 +105,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 3b79694137..36c9594f8c 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -94,10 +94,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -327,7 +325,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
@@ -442,6 +441,7 @@ load_configs(const char *path)
 			test_case->dcoreid = (uint8_t)atoi(dcoreid);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -476,6 +476,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 3d75edd1de..56e0c77e25 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -49,11 +49,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -66,6 +69,6 @@ struct test_configure {
 	uint64_t raddr;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v5 1/4] app/dma-perf: add skip support
  2023-10-26 18:31       ` [PATCH v5 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2023-11-10  9:03         ` Anoob Joseph
  0 siblings, 0 replies; 79+ messages in thread
From: Anoob Joseph @ 2023-11-10  9:03 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev, Thomas Monjalon
  Cc: Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla
> Subject: [PATCH v5 1/4] app/dma-perf: add skip support
> 
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> 
> Add support to skip running a dma-perf test-case.
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v5 2/4] app/dma-perf: add PCI device support
  2023-10-26 18:31       ` [PATCH v5 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2023-11-10  9:04         ` Anoob Joseph
  0 siblings, 0 replies; 79+ messages in thread
From: Anoob Joseph @ 2023-11-10  9:04 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev, Thomas Monjalon
  Cc: Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla
> Subject: [PATCH v5 2/4] app/dma-perf: add PCI device support
> 
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> 
> Add support to test performance for "device to memory" and "memory to
> device" data transfer.
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v5 3/4] app/dma-perf: validate copied memory
  2023-10-26 18:31       ` [PATCH v5 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-11-10  9:05         ` Anoob Joseph
  0 siblings, 0 replies; 79+ messages in thread
From: Anoob Joseph @ 2023-11-10  9:05 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla,
	Thomas Monjalon, Gowrishankar Muthukrishnan
> Subject: [PATCH v5 3/4] app/dma-perf: validate copied memory
> 
> Validate copied memory to ensure DMA copy did not fail.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v5 4/4] app/dma-perf: add SG copy support
  2023-10-26 18:31       ` [PATCH v5 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-11-10  9:07         ` Anoob Joseph
  0 siblings, 0 replies; 79+ messages in thread
From: Anoob Joseph @ 2023-11-10  9:07 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla,
	Gowrishankar Muthukrishnan, Thomas Monjalon
> 
> Add SG copy support.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v6 0/4] PCI Dev and SG copy support
  2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
                         ` (3 preceding siblings ...)
  2023-10-26 18:31       ` [PATCH v5 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-11-13  4:41       ` Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
                           ` (4 more replies)
  4 siblings, 5 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-13  4:41 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Improve the dma-perf application to support PCI devices and SG
(scatter-gather) copy, along with the following additions:
 - validate copied memory
 - skip test cases that are configured to be skipped.
v6:
 - PCI patch updated.
Amit Prakash Shukla (2):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 383 +++++++++++++++++++++++++++++++---
 app/test-dma-perf/config.ini  |  56 +++++
 app/test-dma-perf/main.c      | 136 +++++++++++-
 app/test-dma-perf/main.h      |  12 +-
 4 files changed, 545 insertions(+), 42 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v6 1/4] app/dma-perf: add skip support
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
@ 2023-11-13  4:41         ` Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-13  4:41 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 23 +++++++++++++++++++++++
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 26 insertions(+)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..4d59234b2a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -36,6 +36,8 @@
 ; If you do not specify a result file, one will be generated with the same name as the configuration
 ; file, with the addition of "_result.csv" at the end.
 
+; "skip" To skip a test-case set skip to 1.
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e5bccc27da..61260fa072 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -320,6 +320,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -339,6 +340,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -523,6 +531,21 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
 		if (!test_cases[i].is_valid) {
 			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index f65e264378..be89cb2b65 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -41,6 +41,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v6 2/4] app/dma-perf: add PCI device support
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2023-11-13  4:41         ` Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-13  4:41 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to test performance for "device to memory" and
"memory to device" data transfer.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
v6:
 - fix mbuf IOVA with remote address
---
---
 app/test-dma-perf/benchmark.c | 108 +++++++++++++++++++++++++++++++---
 app/test-dma-perf/config.ini  |  37 ++++++++++++
 app/test-dma-perf/main.c      |  67 +++++++++++++++++++++
 app/test-dma-perf/main.h      |   6 ++
 4 files changed, 209 insertions(+), 9 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0601e0d171..501e21981f 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->dcoreid;
+		qconf->dst_port.pcie.vfid = cfg->vfid;
+		qconf->dst_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->scoreid;
+		qconf->src_port.pcie.vfid = cfg->vfid;
+		qconf->src_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -301,13 +337,22 @@ do_cpu_mem_copy(void *p)
 	return 0;
 }
 
+static void
+dummy_free_ext_buf(void *addr, void *opaque)
+{
+	RTE_SET_USED(addr);
+	RTE_SET_USED(opaque);
+}
+
 static int
 setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -360,16 +405,47 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
+	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+		if (ext_buf_info == NULL) {
+			printf("Error: ext_buf_info malloc failed.\n");
+			return -1;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)cfg->raddr,
+						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)cfg->raddr,
+						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
 	return 0;
 }
 
 void
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -475,6 +551,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
 
 out:
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+		m = srcs;
+	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
+		m = dsts;
+
+	if (m) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_detach_extbuf(m[i]);
+
+		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+			rte_free(m[0]->shinfo);
+	}
+
 	/* free mbufs used in the test */
 	if (srcs != NULL)
 		rte_pktmbuf_free_bulk(srcs, nr_buf);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 4d59234b2a..cddcf93c6e 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,23 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
+; ==================================================================================
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    0 - mem to mem transfer
+;    1 - mem to dev transfer
+;    2 - dev to mem transfer
+; If not specified the default value is 0 (mem to mem transfer).
+
+; "raddr" remote iova address for "mem to dev" and "dev to mem" transfer.
+
+; "scoreid" denotes source PCIe core index.
+; "dcoreid" denotes destination PCIe core index.
+; "pfid" denotes PF-id to be used for data transfer
+; "vfid" denotes VF-id of PF-id to be used for data transfer.
+
+; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
@@ -52,6 +69,26 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=2
+raddr=0x200000000
+scoreid=0
+dcoreid=0
+pfid=0
+vfid=0
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 61260fa072..9640356592 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,7 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
 
 #include "main.h"
 
@@ -318,9 +319,11 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -358,6 +361,20 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else
+				test_case->transfer_dir = (uint8_t)atoi(transfer_dir);
+
+			if (test_case->transfer_dir >= RTE_DMA_DIR_DEV_TO_DEV) {
+				printf("Error: Invalid transfer direction configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -369,6 +386,56 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			char *endptr;
+
+			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
+			if (raddr == NULL) {
+				printf("Error: No raddr configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->raddr = strtoull(raddr, &endptr, 16);
+
+			vfid = rte_cfgfile_get_entry(cfgfile, section_name, "vfid");
+			if (vfid == NULL) {
+				printf("Error: No vfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->vfid = (uint16_t)atoi(vfid);
+
+			pfid = rte_cfgfile_get_entry(cfgfile, section_name, "pfid");
+			if (pfid == NULL) {
+				printf("Error: No pfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->pfid = (uint8_t)atoi(pfid);
+
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
+			if (scoreid == NULL) {
+				printf("Error: No scoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->scoreid = (uint8_t)atoi(scoreid);
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
+			if (dcoreid == NULL) {
+				printf("Error: No dcoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->dcoreid = (uint8_t)atoi(dcoreid);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index be89cb2b65..617f62f085 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -43,6 +43,7 @@ struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -58,6 +59,11 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	uint8_t scoreid;
+	uint8_t dcoreid;
+	uint8_t pfid;
+	uint16_t vfid;
+	uint64_t raddr;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v6 3/4] app/dma-perf: validate copied memory
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2023-11-13  4:41         ` Gowrishankar Muthukrishnan
  2023-11-13  4:41         ` [PATCH v6 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-13  4:41 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 501e21981f..bd1b38be60 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -405,6 +406,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
 	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
 		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
@@ -439,7 +445,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint32_t i;
@@ -457,6 +463,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -530,6 +537,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -595,4 +612,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 9640356592..3b79694137 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -87,20 +87,24 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -145,8 +149,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 617f62f085..3d75edd1de 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -66,6 +66,6 @@ struct test_configure {
 	uint64_t raddr;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v6 4/4] app/dma-perf: add SG copy support
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                           ` (2 preceding siblings ...)
  2023-11-13  4:41         ` [PATCH v6 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-11-13  4:41         ` Gowrishankar Muthukrishnan
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-13  4:41 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
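Add scatter-gather (SG) copy support using rte_dma_copy_sg().
SG mode is enabled for a test case when both "dma_ptrs_src" and
"dma_ptrs_dst" are present in its configuration section.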
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 274 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  19 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 292 insertions(+), 40 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index bd1b38be60..5e51e17c94 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -346,8 +427,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int buf_size = cfg->buf_size.cur;
@@ -442,20 +524,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -463,12 +581,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -482,13 +620,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -502,10 +650,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -537,13 +698,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -554,10 +755,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -600,13 +799,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index cddcf93c6e..f460b93414 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -69,6 +71,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=2
@@ -88,7 +105,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 3b79694137..36c9594f8c 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -94,10 +94,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -327,7 +325,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
@@ -442,6 +441,7 @@ load_configs(const char *path)
 			test_case->dcoreid = (uint8_t)atoi(dcoreid);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -476,6 +476,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 3d75edd1de..56e0c77e25 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -49,11 +49,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -66,6 +69,6 @@ struct test_configure {
 	uint64_t raddr;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v7 0/4] PCI Dev and SG copy support
  2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                           ` (3 preceding siblings ...)
  2023-11-13  4:41         ` [PATCH v6 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-11-17 12:15         ` Gowrishankar Muthukrishnan
  2023-11-17 12:15           ` [PATCH v7 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
                             ` (4 more replies)
  4 siblings, 5 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-17 12:15 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Improve the dma-perf application to support PCI device and SG copy,
along with the additional features below:
 - validate copied memory
 - skip test cases that are configured to be skipped.
v7:
 - PCI patch updated.
Amit Prakash Shukla (2):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 383 +++++++++++++++++++++++++++++++---
 app/test-dma-perf/config.ini  |  56 +++++
 app/test-dma-perf/main.c      | 136 +++++++++++-
 app/test-dma-perf/main.h      |  12 +-
 4 files changed, 545 insertions(+), 42 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v7 1/4] app/dma-perf: add skip support
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
@ 2023-11-17 12:15           ` Gowrishankar Muthukrishnan
  2023-11-20  2:54             ` fengchengwen
  2023-11-17 12:15           ` [PATCH v7 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
                             ` (3 subsequent siblings)
  4 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-17 12:15 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 23 +++++++++++++++++++++++
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 26 insertions(+)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..4d59234b2a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -36,6 +36,8 @@
 ; If you do not specify a result file, one will be generated with the same name as the configuration
 ; file, with the addition of "_result.csv" at the end.
 
+; "skip" denotes whether to skip a test case; set it to 1 to skip the case.
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 5f8bab8f45..c74f1d81bd 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -320,6 +320,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -339,6 +340,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -523,6 +531,21 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
 		if (!test_cases[i].is_valid) {
 			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 62085e6e8f..32670151af 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -40,6 +40,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v7 2/4] app/dma-perf: add PCI device support
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2023-11-17 12:15           ` [PATCH v7 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2023-11-17 12:15           ` Gowrishankar Muthukrishnan
  2023-11-17 12:15           ` [PATCH v7 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                             ` (2 subsequent siblings)
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-17 12:15 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to test performance for "device to memory" and
"memory to device" data transfer.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
v7:
 - changed cfg->raddr type to uintptr_t to fix 32 bit compilation.
---
 app/test-dma-perf/benchmark.c | 108 +++++++++++++++++++++++++++++++---
 app/test-dma-perf/config.ini  |  37 ++++++++++++
 app/test-dma-perf/main.c      |  67 +++++++++++++++++++++
 app/test-dma-perf/main.h      |   6 ++
 4 files changed, 209 insertions(+), 9 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..eaed224c67 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->dcoreid;
+		qconf->dst_port.pcie.vfid = cfg->vfid;
+		qconf->dst_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->scoreid;
+		qconf->src_port.pcie.vfid = cfg->vfid;
+		qconf->src_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -302,13 +338,22 @@ do_cpu_mem_copy(void *p)
 	return 0;
 }
 
+static void
+dummy_free_ext_buf(void *addr, void *opaque)
+{
+	RTE_SET_USED(addr);
+	RTE_SET_USED(opaque);
+}
+
 static int
 setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -361,16 +406,47 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
+	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+		if (ext_buf_info == NULL) {
+			printf("Error: ext_buf_info malloc failed.\n");
+			return -1;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)cfg->raddr,
+						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)cfg->raddr,
+						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
 	return 0;
 }
 
 void
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -476,6 +552,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
 
 out:
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+		m = srcs;
+	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
+		m = dsts;
+
+	if (m) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_detach_extbuf(m[i]);
+
+		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+			rte_free(m[0]->shinfo);
+	}
+
 	/* free mbufs used in the test */
 	if (srcs != NULL)
 		rte_pktmbuf_free_bulk(srcs, nr_buf);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 4d59234b2a..cddcf93c6e 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,23 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
+; ==================================================================================
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    0 - mem to mem transfer
+;    1 - mem to dev transfer
+;    2 - dev to mem transfer
+; If not specified, the default value is 0 (mem to mem transfer).
+
+; "raddr" denotes the remote iova address for "mem to dev" and "dev to mem" transfers.
+
+; "scoreid" denotes source PCIe core index.
+; "dcoreid" denotes destination PCIe core index.
+; "pfid" denotes PF-id to be used for data transfer.
+; "vfid" denotes VF-id of PF-id to be used for data transfer.
+
+; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
@@ -52,6 +69,26 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=2
+raddr=0x200000000
+scoreid=0
+dcoreid=0
+pfid=0
+vfid=0
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index c74f1d81bd..4671ca5335 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,7 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
 
 #include "main.h"
 
@@ -318,9 +319,11 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -358,6 +361,20 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else
+				test_case->transfer_dir = (uint8_t)atoi(transfer_dir);
+
+			if (test_case->transfer_dir >= RTE_DMA_DIR_DEV_TO_DEV) {
+				printf("Error: Invalid transfer direction configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -369,6 +386,56 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			char *endptr;
+
+			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
+			if (raddr == NULL) {
+				printf("Error: No raddr configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->raddr = strtoull(raddr, &endptr, 16);
+
+			vfid = rte_cfgfile_get_entry(cfgfile, section_name, "vfid");
+			if (vfid == NULL) {
+				printf("Error: No vfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->vfid = (uint16_t)atoi(vfid);
+
+			pfid = rte_cfgfile_get_entry(cfgfile, section_name, "pfid");
+			if (pfid == NULL) {
+				printf("Error: No pfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->pfid = (uint8_t)atoi(pfid);
+
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
+			if (scoreid == NULL) {
+				printf("Error: No scoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->scoreid = (uint8_t)atoi(scoreid);
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
+			if (dcoreid == NULL) {
+				printf("Error: No dcoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->dcoreid = (uint8_t)atoi(dcoreid);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 32670151af..8ac3270fba 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -42,6 +42,7 @@ struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -57,6 +58,11 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	uint8_t scoreid;
+	uint8_t dcoreid;
+	uint8_t pfid;
+	uint16_t vfid;
+	uintptr_t raddr;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v7 3/4] app/dma-perf: validate copied memory
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2023-11-17 12:15           ` [PATCH v7 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
  2023-11-17 12:15           ` [PATCH v7 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2023-11-17 12:15           ` Gowrishankar Muthukrishnan
  2023-11-17 12:15           ` [PATCH v7 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-17 12:15 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index eaed224c67..034461da4e 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -406,6 +407,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
 	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
 		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
@@ -440,7 +446,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint32_t i;
@@ -458,6 +464,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -531,6 +538,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -596,4 +613,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 4671ca5335..4dbba255ed 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -87,20 +87,24 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -145,8 +149,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 8ac3270fba..7dcaa166f2 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -65,6 +65,6 @@ struct test_configure {
 	uintptr_t raddr;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v7 4/4] app/dma-perf: add SG copy support
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                             ` (2 preceding siblings ...)
  2023-11-17 12:15           ` [PATCH v7 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-11-17 12:15           ` Gowrishankar Muthukrishnan
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-17 12:15 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Gowrishankar Muthukrishnan
Add SG (scatter-gather) copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 274 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  19 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 292 insertions(+), 40 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 034461da4e..4530bd98ce 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int buf_size = cfg->buf_size.cur;
@@ -443,20 +525,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -464,12 +582,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -483,13 +621,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -503,10 +651,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -538,13 +699,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -555,10 +756,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -601,13 +800,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index cddcf93c6e..f460b93414 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -69,6 +71,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=2
@@ -88,7 +105,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 4dbba255ed..051b7af9ea 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -94,10 +94,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -327,7 +325,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
@@ -442,6 +441,7 @@ load_configs(const char *path)
 			test_case->dcoreid = (uint8_t)atoi(dcoreid);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -476,6 +476,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 7dcaa166f2..31e0bf71c9 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -48,11 +48,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -65,6 +68,6 @@ struct test_configure {
 	uintptr_t raddr;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v7 1/4] app/dma-perf: add skip support
  2023-11-17 12:15           ` [PATCH v7 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2023-11-20  2:54             ` fengchengwen
  2023-11-22 12:01               ` [EXT] " Amit Prakash Shukla
  0 siblings, 1 reply; 79+ messages in thread
From: fengchengwen @ 2023-11-20  2:54 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
Hi Gowrishankar and Amit,
On 2023/11/17 20:15, Gowrishankar Muthukrishnan wrote:
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> 
> Add support to skip running a dma-perf test-case.
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> ---
>  app/test-dma-perf/config.ini |  2 ++
>  app/test-dma-perf/main.c     | 23 +++++++++++++++++++++++
>  app/test-dma-perf/main.h     |  1 +
>  3 files changed, 26 insertions(+)
> 
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> index b550f4b23f..4d59234b2a 100644
> --- a/app/test-dma-perf/config.ini
> +++ b/app/test-dma-perf/config.ini
> @@ -36,6 +36,8 @@
>  ; If you do not specify a result file, one will be generated with the same name as the configuration
>  ; file, with the addition of "_result.csv" at the end.
>  
> +; "skip" To skip a test-case set skip to 1.
> +
>  [case1]
>  type=DMA_MEM_COPY
>  mem_size=10
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index 5f8bab8f45..c74f1d81bd 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -320,6 +320,7 @@ load_configs(const char *path)
>  	const char *case_type;
>  	const char *lcore_dma;
>  	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
> +	const char *skip;
>  	int args_nr, nb_vp;
>  	bool is_dma;
>  
> @@ -339,6 +340,13 @@ load_configs(const char *path)
>  	for (i = 0; i < nb_sections; i++) {
>  		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
>  		test_case = &test_cases[i];
> +
> +		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
> +		if (skip && (atoi(skip) == 1)) {
> +			test_case->is_skip = true;
> +			continue;
> +		}
> +
>  		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
>  		if (case_type == NULL) {
>  			printf("Error: No case type in case %d, the test will be finished here.\n",
> @@ -523,6 +531,21 @@ main(int argc, char *argv[])
>  
>  	printf("Running cases...\n");
>  	for (i = 0; i < case_nb; i++) {
> +		if (test_cases[i].is_skip) {
> +			printf("Test case %d configured to be skipped.\n\n", i + 1);
> +			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
> +				 i + 1);
> +
> +			fd = fopen(rst_path_ptr, "a");
> +			if (!fd) {
> +				printf("Open output CSV file error.\n");
> +				return 0;
> +			}
> +			output_csv(true);
> +			fclose(fd);
> +			continue;
> +		}
This handling is similar to the following two cases:
1\ if (!test_cases[i].is_valid)
2\ if (test_cases[i].test_type == TEST_TYPE_NONE)
They differ only in the error message; I suggest abstracting the common part into one function.
With above fixed,
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
Thanks
Chengwen
> +
>  		if (!test_cases[i].is_valid) {
>  			printf("Invalid test case %d.\n\n", i + 1);
>  			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 62085e6e8f..32670151af 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -40,6 +40,7 @@ struct lcore_dma_map_t {
>  
>  struct test_configure {
>  	bool is_valid;
> +	bool is_skip;
>  	uint8_t test_type;
>  	const char *test_type_str;
>  	uint16_t src_numa_node;
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v8 0/4] PCI Dev and SG copy support
  2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                             ` (3 preceding siblings ...)
  2023-11-17 12:15           ` [PATCH v7 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-11-22 11:06           ` Gowrishankar Muthukrishnan
  2023-11-22 11:06             ` [PATCH v8 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
                               ` (5 more replies)
  4 siblings, 6 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-22 11:06 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng,
	Gowrishankar Muthukrishnan
Improve the dma-perf application to support PCI devices and SG copy,
along with the following additions:
 - validate copied memory
 - skip test cases that are configured to be skipped.
v8:
 - updated skip test patch.
Amit Prakash Shukla (2):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 383 +++++++++++++++++++++++++++++++---
 app/test-dma-perf/config.ini  |  56 +++++
 app/test-dma-perf/main.c      | 161 +++++++++++---
 app/test-dma-perf/main.h      |  12 +-
 4 files changed, 551 insertions(+), 61 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v8 1/4] app/dma-perf: add skip support
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
@ 2023-11-22 11:06             ` Gowrishankar Muthukrishnan
  2023-11-22 11:06             ` [PATCH v8 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
                               ` (4 subsequent siblings)
  5 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-22 11:06 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v8:
 - abstracted csv file write error log
---
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 48 ++++++++++++++++++++++--------------
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..4d59234b2a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -36,6 +36,8 @@
 ; If you do not specify a result file, one will be generated with the same name as the configuration
 ; file, with the addition of "_result.csv" at the end.
 
+; "skip" denotes whether to skip a test case. Set it to 1 to skip the case.
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 5f8bab8f45..33c3750bb1 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -86,6 +86,19 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
+static int
+open_output_csv(const char *rst_path_ptr)
+{
+	fd = fopen(rst_path_ptr, "a");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 1;
+	}
+	output_csv(true);
+	fclose(fd);
+	return 0;
+}
+
 static void
 run_test_case(struct test_configure *case_cfg)
 {
@@ -320,6 +333,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -339,6 +353,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -523,31 +544,20 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
-		if (!test_cases[i].is_valid) {
-			printf("Invalid test case %d.\n\n", i + 1);
-			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
-		if (test_cases[i].test_type == TEST_TYPE_NONE) {
-			printf("No valid test type in test case %d.\n\n", i + 1);
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 62085e6e8f..32670151af 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -40,6 +40,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v8 2/4] app/dma-perf: add PCI device support
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2023-11-22 11:06             ` [PATCH v8 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2023-11-22 11:06             ` Gowrishankar Muthukrishnan
  2023-11-23  1:12               ` fengchengwen
  2024-02-21  3:26               ` fengchengwen
  2023-11-22 11:06             ` [PATCH v8 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                               ` (3 subsequent siblings)
  5 siblings, 2 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-22 11:06 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng
From: Amit Prakash Shukla <amitprakashs@marvell.com>
Add support to test performance for "device to memory" and
"memory to device" data transfer.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 108 +++++++++++++++++++++++++++++++---
 app/test-dma-perf/config.ini  |  37 ++++++++++++
 app/test-dma-perf/main.c      |  67 +++++++++++++++++++++
 app/test-dma-perf/main.h      |   6 ++
 4 files changed, 209 insertions(+), 9 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..eaed224c67 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->dcoreid;
+		qconf->dst_port.pcie.vfid = cfg->vfid;
+		qconf->dst_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->scoreid;
+		qconf->src_port.pcie.vfid = cfg->vfid;
+		qconf->src_port.pcie.pfid = cfg->pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -302,13 +338,22 @@ do_cpu_mem_copy(void *p)
 	return 0;
 }
 
+static void
+dummy_free_ext_buf(void *addr, void *opaque)
+{
+	RTE_SET_USED(addr);
+	RTE_SET_USED(opaque);
+}
+
 static int
 setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -361,16 +406,47 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
+	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+		if (ext_buf_info == NULL) {
+			printf("Error: ext_buf_info malloc failed.\n");
+			return -1;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)cfg->raddr,
+						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)cfg->raddr,
+						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
 	return 0;
 }
 
 void
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -476,6 +552,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
 
 out:
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+		m = srcs;
+	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
+		m = dsts;
+
+	if (m) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_detach_extbuf(m[i]);
+
+		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+			rte_free(m[0]->shinfo);
+	}
+
 	/* free mbufs used in the test */
 	if (srcs != NULL)
 		rte_pktmbuf_free_bulk(srcs, nr_buf);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 4d59234b2a..cddcf93c6e 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,23 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
+; ==================================================================================
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    0 - mem to mem transfer
+;    1 - mem to dev transfer
+;    2 - dev to mem transfer
+; If not specified the default value is 0 (mem to mem transfer).
+
+; "raddr" remote iova address for "mem to dev" and "dev to mem" transfer.
+
+; "scoreid" denotes source PCIe core index.
+; "dcoreid" denotes destination PCIe core index.
+; "pfid" denotes PF-id to be used for data transfer
+; "vfid" denotes VF-id of PF-id to be used for data transfer.
+
+; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
@@ -52,6 +69,26 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=2
+raddr=0x200000000
+scoreid=0
+dcoreid=0
+pfid=0
+vfid=0
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 33c3750bb1..3eddf2e40a 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,7 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
 
 #include "main.h"
 
@@ -331,9 +332,11 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -371,6 +374,20 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else
+				test_case->transfer_dir = (uint8_t)atoi(transfer_dir);
+
+			if (test_case->transfer_dir >= RTE_DMA_DIR_DEV_TO_DEV) {
+				printf("Error: Invalid transfer direction configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -382,6 +399,56 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			char *endptr;
+
+			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
+			if (raddr == NULL) {
+				printf("Error: No raddr configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->raddr = strtoull(raddr, &endptr, 16);
+
+			vfid = rte_cfgfile_get_entry(cfgfile, section_name, "vfid");
+			if (vfid == NULL) {
+				printf("Error: No vfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->vfid = (uint16_t)atoi(vfid);
+
+			pfid = rte_cfgfile_get_entry(cfgfile, section_name, "pfid");
+			if (pfid == NULL) {
+				printf("Error: No pfid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->pfid = (uint8_t)atoi(pfid);
+
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
+			if (scoreid == NULL) {
+				printf("Error: No scoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->scoreid = (uint8_t)atoi(scoreid);
+		}
+
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
+			if (dcoreid == NULL) {
+				printf("Error: No dcoreid configured for case%d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+			test_case->dcoreid = (uint8_t)atoi(dcoreid);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 32670151af..8ac3270fba 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -42,6 +42,7 @@ struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -57,6 +58,11 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	uint8_t scoreid;
+	uint8_t dcoreid;
+	uint8_t pfid;
+	uint16_t vfid;
+	uintptr_t raddr;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v8 3/4] app/dma-perf: validate copied memory
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2023-11-22 11:06             ` [PATCH v8 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
  2023-11-22 11:06             ` [PATCH v8 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2023-11-22 11:06             ` Gowrishankar Muthukrishnan
  2023-11-23  1:14               ` fengchengwen
  2023-11-22 11:06             ` [PATCH v8 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
                               ` (2 subsequent siblings)
  5 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-22 11:06 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng,
	Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index eaed224c67..034461da4e 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -406,6 +407,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
 	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
 		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
@@ -440,7 +446,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint32_t i;
@@ -458,6 +464,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -531,6 +538,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -596,4 +613,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 3eddf2e40a..1d54173a9c 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -100,20 +100,24 @@ open_output_csv(const char *rst_path_ptr)
 	return 0;
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -158,8 +162,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 8ac3270fba..7dcaa166f2 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -65,6 +65,6 @@ struct test_configure {
 	uintptr_t raddr;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v8 4/4] app/dma-perf: add SG copy support
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                               ` (2 preceding siblings ...)
  2023-11-22 11:06             ` [PATCH v8 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-11-22 11:06             ` Gowrishankar Muthukrishnan
  2024-01-25 12:44               ` fengchengwen
  2024-02-21  3:52               ` fengchengwen
  2023-12-07 10:11             ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
  5 siblings, 2 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-11-22 11:06 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng,
	Gowrishankar Muthukrishnan
Add SG copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
---
 app/test-dma-perf/benchmark.c | 274 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  19 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 292 insertions(+), 40 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 034461da4e..4530bd98ce 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int buf_size = cfg->buf_size.cur;
@@ -443,20 +525,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = buf_size / src_ptrs;
+		sglen_dst = buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -464,12 +582,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -483,13 +621,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -503,10 +651,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -538,13 +699,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -555,10 +756,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -601,13 +800,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index cddcf93c6e..f460b93414 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -9,6 +9,8 @@
 ; "buf_size" denotes the memory size of a single operation.
 ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
 ;  64 and 4096.
+; "dma_ptrs_src" denotes number of source segments.
+; "dma_ptrs_dst" denotes number of destination segments.
 ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
 
 ; The format for variables is variable=first,last,increment,ADD|MUL.
@@ -69,6 +71,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_ptrs_src=4
+dma_ptrs_dst=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=2
@@ -88,7 +105,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 1d54173a9c..e81eca14e1 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -107,10 +107,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -340,7 +338,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
 	int args_nr, nb_vp;
@@ -455,6 +454,7 @@ load_configs(const char *path)
 			test_case->dcoreid = (uint8_t)atoi(dcoreid);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -489,6 +489,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_src");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_src"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ptrs_dst");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_ptrs_dst"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 7dcaa166f2..31e0bf71c9 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -48,11 +48,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -65,6 +68,6 @@ struct test_configure {
 	uintptr_t raddr;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXT] Re: [PATCH v7 1/4] app/dma-perf: add skip support
  2023-11-20  2:54             ` fengchengwen
@ 2023-11-22 12:01               ` Amit Prakash Shukla
  0 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2023-11-22 12:01 UTC (permalink / raw)
  To: fengchengwen, Gowrishankar Muthukrishnan, dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula
Hi Chengwen,
Thanks for the review and feedback. The changes you suggested are incorporated in v8, sent by Gowrishankar.
In v8, we have also removed a redundant check for TEST_TYPE_NONE.
Thanks,
Amit Shukla
> -----Original Message-----
> From: fengchengwen <fengchengwen@huawei.com>
> Sent: Monday, November 20, 2023 8:25 AM
> To: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>;
> dev@dpdk.org
> Cc: Anoob Joseph <anoobj@marvell.com>; Cheng Jiang
> <honest.jiang@foxmail.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>
> Subject: [EXT] Re: [PATCH v7 1/4] app/dma-perf: add skip support
> 
> External Email
> 
> ----------------------------------------------------------------------
> Hi Gowrishankar and Amit,
> 
> On 2023/11/17 20:15, Gowrishankar Muthukrishnan wrote:
> > From: Amit Prakash Shukla <amitprakashs@marvell.com>
> >
> > Add support to skip running a dma-perf test-case.
> >
> > Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> > Acked-by: Anoob Joseph <anoobj@marvell.com>
> > ---
> >  app/test-dma-perf/config.ini |  2 ++
> >  app/test-dma-perf/main.c     | 23 +++++++++++++++++++++++
> >  app/test-dma-perf/main.h     |  1 +
> >  3 files changed, 26 insertions(+)
> >
> > diff --git a/app/test-dma-perf/config.ini
> > b/app/test-dma-perf/config.ini index b550f4b23f..4d59234b2a 100644
> > --- a/app/test-dma-perf/config.ini
> > +++ b/app/test-dma-perf/config.ini
> > @@ -36,6 +36,8 @@
> >  ; If you do not specify a result file, one will be generated with the
> > same name as the configuration  ; file, with the addition of "_result.csv" at
> the end.
> >
> > +; "skip" To skip a test-case set skip to 1.
> > +
> >  [case1]
> >  type=DMA_MEM_COPY
> >  mem_size=10
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c index
> > 5f8bab8f45..c74f1d81bd 100644
> > --- a/app/test-dma-perf/main.c
> > +++ b/app/test-dma-perf/main.c
> > @@ -320,6 +320,7 @@ load_configs(const char *path)
> >  	const char *case_type;
> >  	const char *lcore_dma;
> >  	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > *kick_batch_str;
> > +	const char *skip;
> >  	int args_nr, nb_vp;
> >  	bool is_dma;
> >
> > @@ -339,6 +340,13 @@ load_configs(const char *path)
> >  	for (i = 0; i < nb_sections; i++) {
> >  		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
> >  		test_case = &test_cases[i];
> > +
> > +		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
> > +		if (skip && (atoi(skip) == 1)) {
> > +			test_case->is_skip = true;
> > +			continue;
> > +		}
> > +
> >  		case_type = rte_cfgfile_get_entry(cfgfile, section_name,
> "type");
> >  		if (case_type == NULL) {
> >  			printf("Error: No case type in case %d, the test will be
> finished
> > here.\n", @@ -523,6 +531,21 @@ main(int argc, char *argv[])
> >
> >  	printf("Running cases...\n");
> >  	for (i = 0; i < case_nb; i++) {
> > +		if (test_cases[i].is_skip) {
> > +			printf("Test case %d configured to be skipped.\n\n", i
> + 1);
> > +			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip
> the test-case %d\n",
> > +				 i + 1);
> > +
> > +			fd = fopen(rst_path_ptr, "a");
> > +			if (!fd) {
> > +				printf("Open output CSV file error.\n");
> > +				return 0;
> > +			}
> > +			output_csv(true);
> > +			fclose(fd);
> > +			continue;
> > +		}
> 
> The process is similar to following two cases:
> 1\ if (!test_cases[i].is_valid)
> 2\ if (test_cases[i].test_type == TEST_TYPE_NONE)
> 
> They just different in error info, suggest abstract one function.
> 
> With above fixed,
> Acked-by: Chengwen Feng <fengchengwen@huawei.com>
> 
> Thanks
> Chengwen
> 
<snip>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v8 2/4] app/dma-perf: add PCI device support
  2023-11-22 11:06             ` [PATCH v8 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2023-11-23  1:12               ` fengchengwen
  2024-02-21  3:26               ` fengchengwen
  1 sibling, 0 replies; 79+ messages in thread
From: fengchengwen @ 2023-11-23  1:12 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
On 2023/11/22 19:06, Gowrishankar Muthukrishnan wrote:
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> 
> Add support to test performance for "device to memory" and
> "memory to device" data transfer.
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 108 +++++++++++++++++++++++++++++++---
>  app/test-dma-perf/config.ini  |  37 ++++++++++++
>  app/test-dma-perf/main.c      |  67 +++++++++++++++++++++
>  app/test-dma-perf/main.h      |   6 ++
>  4 files changed, 209 insertions(+), 9 deletions(-)
> 
...
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v8 3/4] app/dma-perf: validate copied memory
  2023-11-22 11:06             ` [PATCH v8 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2023-11-23  1:14               ` fengchengwen
  0 siblings, 0 replies; 79+ messages in thread
From: fengchengwen @ 2023-11-23  1:14 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
On 2023/11/22 19:06, Gowrishankar Muthukrishnan wrote:
> Validate copied memory to ensure DMA copy did not fail.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
>  app/test-dma-perf/main.c      | 16 +++++++++++-----
>  app/test-dma-perf/main.h      |  2 +-
>  3 files changed, 32 insertions(+), 7 deletions(-)
> 
...
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v8 0/4] PCI Dev and SG copy support
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                               ` (3 preceding siblings ...)
  2023-11-22 11:06             ` [PATCH v8 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2023-12-07 10:11             ` Gowrishankar Muthukrishnan
  2024-02-05 10:37               ` Gowrishankar Muthukrishnan
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
  5 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2023-12-07 10:11 UTC (permalink / raw)
  To: dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Chengwen Feng
Hi,
Could this patch series be reviewed? As of now, the patches are rebased without any conflicts.
Thanks,
Gowrishankar
> -----Original Message-----
> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Sent: Wednesday, November 22, 2023 4:36 PM
> To: dev@dpdk.org
> Cc: Anoob Joseph <anoobj@marvell.com>; Cheng Jiang
> <honest.jiang@foxmail.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Chengwen Feng
> <fengchengwen@huawei.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>
> Subject: [PATCH v8 0/4] PCI Dev and SG copy support
> 
> Improve dma-perf application to support PCI dev and SG copy, along with
> additional supports below:
>  - validate copied memory
>  - skip tests if not opted.
> 
> v8:
>  - updated skip test patch.
> 
> Amit Prakash Shukla (2):
>   app/dma-perf: add skip support
>   app/dma-perf: add PCI device support
> 
> Gowrishankar Muthukrishnan (2):
>   app/dma-perf: validate copied memory
>   app/dma-perf: add SG copy support
> 
>  app/test-dma-perf/benchmark.c | 383
> +++++++++++++++++++++++++++++++---
>  app/test-dma-perf/config.ini  |  56 +++++
>  app/test-dma-perf/main.c      | 161 +++++++++++---
>  app/test-dma-perf/main.h      |  12 +-
>  4 files changed, 551 insertions(+), 61 deletions(-)
> 
> --
> 2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v8 4/4] app/dma-perf: add SG copy support
  2023-11-22 11:06             ` [PATCH v8 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2024-01-25 12:44               ` fengchengwen
  2024-02-21  3:52               ` fengchengwen
  1 sibling, 0 replies; 79+ messages in thread
From: fengchengwen @ 2024-01-25 12:44 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
LGTM
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
Thanks
On 2023/11/22 19:06, Gowrishankar Muthukrishnan wrote:
> Add SG copy support.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 274 +++++++++++++++++++++++++++++-----
>  app/test-dma-perf/config.ini  |  19 ++-
>  app/test-dma-perf/main.c      |  34 ++++-
>  app/test-dma-perf/main.h      |   5 +-
>  4 files changed, 292 insertions(+), 40 deletions(-)
> 
...
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [PATCH v8 0/4] PCI Dev and SG copy support
  2023-12-07 10:11             ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
@ 2024-02-05 10:37               ` Gowrishankar Muthukrishnan
  0 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-05 10:37 UTC (permalink / raw)
  To: dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Chengwen Feng,
	Jerin Jacob, Thomas Monjalon
[-- Attachment #1: Type: text/plain, Size: 97 bytes --]
Hi,
Could this series be merged, as the patches are acked?
Thanks,
Gowrishankar
[-- Attachment #2: Type: text/html, Size: 1996 bytes --]
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v8 2/4] app/dma-perf: add PCI device support
  2023-11-22 11:06             ` [PATCH v8 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
  2023-11-23  1:12               ` fengchengwen
@ 2024-02-21  3:26               ` fengchengwen
  2024-02-27  9:27                 ` [EXT] " Amit Prakash Shukla
  1 sibling, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-02-21  3:26 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev, Amit Prakash Shukla
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Thomas Monjalon
Hi Amit,
There are many commits for dma-perf, so I've done a re-review and have a few comments; please see below:
On 2023/11/22 19:06, Gowrishankar Muthukrishnan wrote:
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> 
> Add support to test performance for "device to memory" and
> "memory to device" data transfer.
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 108 +++++++++++++++++++++++++++++++---
>  app/test-dma-perf/config.ini  |  37 ++++++++++++
>  app/test-dma-perf/main.c      |  67 +++++++++++++++++++++
>  app/test-dma-perf/main.h      |   6 ++
>  4 files changed, 209 insertions(+), 9 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 9b1f58c78c..eaed224c67 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
>  #endif
>  }
>  
> +static int
> +vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
> +		    struct test_configure *cfg)
> +{
> +	struct rte_dma_info info;
> +
> +	qconf->direction = cfg->transfer_dir;
> +
> +	rte_dma_info_get(dev_id, &info);
> +	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
> +		return -1;
> +
> +	qconf->nb_desc = cfg->ring_size.cur;
> +
> +	switch (qconf->direction) {
> +	case RTE_DMA_DIR_MEM_TO_DEV:
> +		qconf->dst_port.pcie.vfen = 1;
> +		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
> +		qconf->dst_port.pcie.coreid = cfg->dcoreid;
> +		qconf->dst_port.pcie.vfid = cfg->vfid;
> +		qconf->dst_port.pcie.pfid = cfg->pfid;
> +		break;
> +	case RTE_DMA_DIR_DEV_TO_MEM:
> +		qconf->src_port.pcie.vfen = 1;
> +		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
> +		qconf->src_port.pcie.coreid = cfg->scoreid;
> +		qconf->src_port.pcie.vfid = cfg->vfid;
> +		qconf->src_port.pcie.pfid = cfg->pfid;
> +		break;
> +	case RTE_DMA_DIR_MEM_TO_MEM:
> +	case RTE_DMA_DIR_DEV_TO_DEV:
> +		break;
> +	}
> +
> +	return 0;
> +}
> +
>  /* Configuration of device. */
>  static void
> -configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
>  {
>  	uint16_t vchan = 0;
>  	struct rte_dma_info info;
>  	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> -	struct rte_dma_vchan_conf qconf = {
> -		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> -		.nb_desc = ring_size
> -	};
> +	struct rte_dma_vchan_conf qconf = { 0 };
> +
> +	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
>  
>  	if (rte_dma_configure(dev_id, &dev_config) != 0)
>  		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> @@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
>  static int
>  config_dmadevs(struct test_configure *cfg)
>  {
> -	uint32_t ring_size = cfg->ring_size.cur;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>  	uint32_t nb_workers = ldm->cnt;
>  	uint32_t i;
> @@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
>  		}
>  
>  		ldm->dma_ids[i] = dev_id;
> -		configure_dmadev_queue(dev_id, ring_size);
> +		configure_dmadev_queue(dev_id, cfg);
>  		++nb_dmadevs;
>  	}
>  
> @@ -302,13 +338,22 @@ do_cpu_mem_copy(void *p)
>  	return 0;
>  }
>  
> +static void
> +dummy_free_ext_buf(void *addr, void *opaque)
> +{
> +	RTE_SET_USED(addr);
> +	RTE_SET_USED(opaque);
> +}
> +
>  static int
>  setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  			struct rte_mbuf ***dsts)
>  {
> +	static struct rte_mbuf_ext_shared_info *ext_buf_info;
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	unsigned int nr_sockets;
>  	uint32_t nr_buf = cfg->nr_buf;
> +	uint32_t i;
>  
>  	nr_sockets = rte_socket_count();
>  	if (cfg->src_numa_node >= nr_sockets ||
> @@ -361,16 +406,47 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		return -1;
>  	}
>  
> +	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
> +	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
> +		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
> +		if (ext_buf_info == NULL) {
> +			printf("Error: ext_buf_info malloc failed.\n");
> +			return -1;
> +		}
> +	}
> +
> +	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> +		ext_buf_info->free_cb = dummy_free_ext_buf;
> +		ext_buf_info->fcb_opaque = NULL;
> +		for (i = 0; i < nr_buf; i++) {
> +			/* Using mbuf structure to hold remote iova address. */
> +			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)cfg->raddr,
> +						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
> +			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
> +		}
> +	}
> +
> +	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
> +		ext_buf_info->free_cb = dummy_free_ext_buf;
> +		ext_buf_info->fcb_opaque = NULL;
> +		for (i = 0; i < nr_buf; i++) {
> +			/* Using mbuf structure to hold remote iova address. */
> +			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)cfg->raddr,
> +						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
> +			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
> +		}
> +	}
> +
>  	return 0;
>  }
>  
>  void
>  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  {
> -	uint16_t i;
> +	uint32_t i;
>  	uint32_t offset;
>  	unsigned int lcore_id = 0;
> -	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	uint16_t kick_batch = cfg->kick_batch.cur;
> @@ -476,6 +552,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
>  
>  out:
> +
> +	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
> +		m = srcs;
> +	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
> +		m = dsts;
> +
> +	if (m) {
> +		for (i = 0; i < nr_buf; i++)
> +			rte_pktmbuf_detach_extbuf(m[i]);
> +
> +		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
> +			rte_free(m[0]->shinfo);
> +	}
> +
>  	/* free mbufs used in the test */
>  	if (srcs != NULL)
>  		rte_pktmbuf_free_bulk(srcs, nr_buf);
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> index 4d59234b2a..cddcf93c6e 100644
> --- a/app/test-dma-perf/config.ini
> +++ b/app/test-dma-perf/config.ini
> @@ -38,6 +38,23 @@
>  
>  ; "skip" To skip a test-case set skip to 1.
>  
> +; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
> +; ==================================================================================
> +; "direction" denotes the direction of data transfer. It can take 3 values:
> +;    0 - mem to mem transfer
> +;    1 - mem to dev transfer
> +;    2 - dev to mem transfer
I prefer readable strings rather than numbers, for example:
mem2mem
mem2dev
dev2mem
> +; If not specified the default value is 0 (mem to mem transfer).
> +
> +; "raddr" remote iova address for "mem to dev" and "dev to mem" transfer.
> +
> +; "scoreid" denotes source PCIe core index.
> +; "dcoreid" denotes destination PCIe core index.
> +; "pfid" denotes PF-id to be used for data transfer
> +; "vfid" denotes VF-id of PF-id to be used for data transfer.
There are too many entries, and they are all about PCIe; 'struct rte_dma_port_param' may support other buses in the future.
I suggest a single entry named vchan_dev, where the user could input something like:
1. vchan_dev=bus=pcie,coreid=1,pfid=0,vfid=1,addr=xxx
   and add a description noting that it is only valid when direction is mem2dev or dev2mem.
The kvargs library could be used to parse the value of the vchan_dev entry.
> +
> +; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
> +
>  [case1]
>  type=DMA_MEM_COPY
>  mem_size=10
> @@ -52,6 +69,26 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
>  [case2]
> +skip=1
> +type=DMA_MEM_COPY
> +direction=2
> +raddr=0x200000000
> +scoreid=0
> +dcoreid=0
> +pfid=0
> +vfid=0
> +mem_size=10
> +buf_size=64,4096,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +test_seconds=2
> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> +eal_args=--in-memory --file-prefix=test
> +
> +[case3]
>  type=CPU_MEM_COPY
>  mem_size=10
>  buf_size=64,8192,2,MUL
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index 33c3750bb1..3eddf2e40a 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -16,6 +16,7 @@
>  #include <rte_cfgfile.h>
>  #include <rte_string_fns.h>
>  #include <rte_lcore.h>
> +#include <rte_dmadev.h>
>  
>  #include "main.h"
>  
> @@ -331,9 +332,11 @@ load_configs(const char *path)
>  	struct test_configure *test_case;
>  	char section_name[CFG_NAME_LEN];
>  	const char *case_type;
> +	const char *transfer_dir;
>  	const char *lcore_dma;
>  	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
>  	const char *skip;
> +	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
>  	int args_nr, nb_vp;
>  	bool is_dma;
>  
> @@ -371,6 +374,20 @@ load_configs(const char *path)
>  		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
>  			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
>  			test_case->test_type_str = DMA_MEM_COPY;
> +
> +			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
> +			if (transfer_dir == NULL) {
> +				printf("Transfer direction not configured."
> +					" Defaulting it to MEM to MEM transfer.\n");
> +				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
> +			} else
> +				test_case->transfer_dir = (uint8_t)atoi(transfer_dir);
> +
> +			if (test_case->transfer_dir >= RTE_DMA_DIR_DEV_TO_DEV) {
> +				printf("Error: Invalid transfer direction configured.\n");
> +				test_case->is_valid = false;
> +				continue;
> +			}
>  			is_dma = true;
>  		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
>  			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
> @@ -382,6 +399,56 @@ load_configs(const char *path)
>  			continue;
>  		}
>  
> +		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
> +			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> +			char *endptr;
> +
> +			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
> +			if (raddr == NULL) {
> +				printf("Error: No raddr configured for case%d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +			test_case->raddr = strtoull(raddr, &endptr, 16);
> +
> +			vfid = rte_cfgfile_get_entry(cfgfile, section_name, "vfid");
> +			if (vfid == NULL) {
> +				printf("Error: No vfid configured for case%d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +			test_case->vfid = (uint16_t)atoi(vfid);
> +
> +			pfid = rte_cfgfile_get_entry(cfgfile, section_name, "pfid");
> +			if (pfid == NULL) {
> +				printf("Error: No pfid configured for case%d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +			test_case->pfid = (uint8_t)atoi(pfid);
> +
> +		}
> +
> +		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> +			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
> +			if (scoreid == NULL) {
> +				printf("Error: No scoreid configured for case%d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +			test_case->scoreid = (uint8_t)atoi(scoreid);
> +		}
> +
> +		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
> +			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
> +			if (dcoreid == NULL) {
> +				printf("Error: No dcoreid configured for case%d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +			test_case->dcoreid = (uint8_t)atoi(dcoreid);
> +		}
> +
I suggest adding a sub-function to wrap the parsing of the device's config.
>  		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
>  								section_name, "src_numa_node"));
>  		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 32670151af..8ac3270fba 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -42,6 +42,7 @@ struct test_configure {
>  	bool is_valid;
>  	bool is_skip;
>  	uint8_t test_type;
> +	uint8_t transfer_dir;
>  	const char *test_type_str;
>  	uint16_t src_numa_node;
>  	uint16_t dst_numa_node;
> @@ -57,6 +58,11 @@ struct test_configure {
>  	uint16_t test_secs;
>  	const char *eal_args;
>  	uint8_t scenario_id;
> +	uint8_t scoreid;
> +	uint8_t dcoreid;
> +	uint8_t pfid;
> +	uint16_t vfid;
> +	uintptr_t raddr;
I suggest creating a new struct:
struct test_vchan_dev_config {
	struct rte_dma_port_param port;
	uintptr_t addr;
};
so that the field is defined as:
  struct test_vchan_dev_config vchan_dev;
Thanks
>  };
>  
>  void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v8 4/4] app/dma-perf: add SG copy support
  2023-11-22 11:06             ` [PATCH v8 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
  2024-01-25 12:44               ` fengchengwen
@ 2024-02-21  3:52               ` fengchengwen
  2024-02-27 16:09                 ` [EXT] " Gowrishankar Muthukrishnan
  1 sibling, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-02-21  3:52 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla
Hi Gowrishankar,
On 2023/11/22 19:06, Gowrishankar Muthukrishnan wrote:
> Add SG copy support.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> ---
>  app/test-dma-perf/benchmark.c | 274 +++++++++++++++++++++++++++++-----
>  app/test-dma-perf/config.ini  |  19 ++-
>  app/test-dma-perf/main.c      |  34 ++++-
>  app/test-dma-perf/main.h      |   5 +-
>  4 files changed, 292 insertions(+), 40 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 034461da4e..4530bd98ce 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -46,6 +46,10 @@ struct lcore_params {
>  	uint16_t test_secs;
>  	struct rte_mbuf **srcs;
>  	struct rte_mbuf **dsts;
> +	struct rte_dma_sge *src_sges;
> +	struct rte_dma_sge *dst_sges;
> +	uint8_t src_ptrs;
> +	uint8_t dst_ptrs;
>  	volatile struct worker_info worker_info;
>  };
>  
> @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
>  }
>  
>  static void
> -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
> -			float memory, float bandwidth, float mops, bool is_dma)
> +output_result(struct test_configure *cfg, struct lcore_params *para,
> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
> +			uint32_t nr_buf, float memory, float bandwidth, float mops)
>  {
> -	if (is_dma)
> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
> -				lcore_id, dma_name, ring_size, kick_batch);
> -	else
> +	uint16_t ring_size = cfg->ring_size.cur;
> +	uint8_t scenario_id = cfg->scenario_id;
> +	uint32_t lcore_id = para->lcore_id;
> +	char *dma_name = para->dma_name;
> +
> +	if (cfg->is_dma) {
> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
> +		       dma_name, ring_size, kick_batch);
> +		if (cfg->is_sg)
> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> +			       para->src_ptrs, para->dst_ptrs);
> +		printf(".\n");
> +	} else {
>  		printf("lcore %u\n", lcore_id);
> +	}
>  
>  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
>  			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
>  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
>  
> -	if (is_dma)
> +	if (cfg->is_dma)
>  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
>  			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
>  			nr_buf, memory, ave_cycle, bandwidth, mops);
> @@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
>  
>  /* Configuration of device. */
>  static void
> -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
>  {
>  	uint16_t vchan = 0;
>  	struct rte_dma_info info;
> @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
>  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
>  				dev_id);
>  
> +	if (info.max_sges < ptrs_max)
> +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
> +				dev_id);
> +
>  	if (rte_dma_start(dev_id) != 0)
>  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
>  }
> @@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
>  	uint32_t i;
>  	int dev_id;
>  	uint16_t nb_dmadevs = 0;
> +	uint8_t ptrs_max = 0;
>  	char *dma_name;
>  
> +	if (cfg->is_sg)
> +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
> +
>  	for (i = 0; i < ldm->cnt; i++) {
>  		dma_name = ldm->dma_names[i];
>  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
>  		}
>  
>  		ldm->dma_ids[i] = dev_id;
> -		configure_dmadev_queue(dev_id, cfg);
> +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
>  		++nb_dmadevs;
>  	}
>  
> @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
>  }
>  
>  static inline int
> -do_dma_mem_copy(void *p)
> +do_dma_plain_mem_copy(void *p)
>  {
>  	struct lcore_params *para = (struct lcore_params *)p;
>  	volatile struct worker_info *worker_info = &(para->worker_info);
> @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
>  	return 0;
>  }
>  
> +static inline int
> +do_dma_sg_mem_copy(void *p)
> +{
> +	struct lcore_params *para = (struct lcore_params *)p;
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	struct rte_dma_sge *src_sges = para->src_sges;
> +	struct rte_dma_sge *dst_sges = para->dst_sges;
> +	const uint16_t kick_batch = para->kick_batch;
> +	const uint8_t src_ptrs = para->src_ptrs;
> +	const uint8_t dst_ptrs = para->dst_ptrs;
> +	const uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint64_t async_cnt = 0;
> +	uint32_t poll_cnt = 0;
> +	uint16_t nr_cpl;
> +	uint32_t i, j;
> +	int ret;
> +
> +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		j = 0;
> +		for (i = 0; i < nr_buf; i++) {
> +dma_copy:
> +			ret = rte_dma_copy_sg(dev_id, 0,
> +				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
> +				src_ptrs, dst_ptrs, 0);
> +			if (unlikely(ret < 0)) {
> +				if (ret == -ENOSPC) {
> +					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> +					goto dma_copy;
> +				} else
> +					error_exit(dev_id);
> +			}
> +			async_cnt++;
> +			j++;
> +
> +			if ((async_cnt % kick_batch) == 0)
> +				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +
>  static inline int
>  do_cpu_mem_copy(void *p)
>  {
> @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
>  }
>  
>  static int
> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> -			struct rte_mbuf ***dsts)
> +setup_memory_env(struct test_configure *cfg,
> +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
> +			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
>  {
>  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
>  	unsigned int buf_size = cfg->buf_size.cur;
> @@ -443,20 +525,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		}
>  	}
>  
> +	if (cfg->is_sg) {
> +		uint8_t src_ptrs = cfg->src_ptrs;
> +		uint8_t dst_ptrs = cfg->dst_ptrs;
> +		uint32_t sglen_src, sglen_dst;
> +
> +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
> +					RTE_CACHE_LINE_SIZE);
> +		if (*src_sges == NULL) {
> +			printf("Error: src_sges array malloc failed.\n");
> +			return -1;
> +		}
> +
> +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
> +					RTE_CACHE_LINE_SIZE);
> +		if (*dst_sges == NULL) {
> +			printf("Error: dst_sges array malloc failed.\n");
> +			return -1;
> +		}
> +
> +		sglen_src = buf_size / src_ptrs;
> +		sglen_dst = buf_size / dst_ptrs;
> +
> +		for (i = 0; i < nr_buf; i++) {
> +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
> +			(*src_sges)[i].length = sglen_src;
> +			if (!((i+1) % src_ptrs))
> +				(*src_sges)[i].length += (buf_size % src_ptrs);
> +
> +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
> +			(*dst_sges)[i].length = sglen_dst;
> +			if (!((i+1) % dst_ptrs))
> +				(*dst_sges)[i].length += (buf_size % dst_ptrs);
> +		}
> +	}
> +
>  	return 0;
>  }
>  
>  int
> -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
> +mem_copy_benchmark(struct test_configure *cfg)
>  {
> -	uint32_t i;
> +	uint32_t i, j;
>  	uint32_t offset;
>  	unsigned int lcore_id = 0;
>  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
> +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	const uint32_t mcore_id = rte_get_main_lcore();
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	uint16_t kick_batch = cfg->kick_batch.cur;
> -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
>  	uint16_t nb_workers = ldm->cnt;
>  	uint16_t test_secs = cfg->test_secs;
>  	float memory = 0;
> @@ -464,12 +582,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	uint32_t avg_cycles_total;
>  	float mops, mops_total;
>  	float bandwidth, bandwidth_total;
> +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
> +	uint32_t nr_buf;
>  	int ret = 0;
>  
> -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +	/* Align number of buffers according to workers count */
> +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +	nr_buf -= (nr_buf % nb_workers);
> +	if (cfg->is_sg) {
> +		nr_buf /= nb_workers;
> +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
> +		nr_buf *= nb_workers;
> +
> +		if (cfg->dst_ptrs > cfg->src_ptrs) {
> +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
> +			nr_sgdst = nr_buf;
> +		} else {
> +			nr_sgsrc = nr_buf;
> +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
> +		}
> +	}
> +
> +	cfg->nr_buf = nr_buf;
> +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
>  		goto out;
>  
> -	if (is_dma)
> +	if (cfg->is_dma)
>  		if (config_dmadevs(cfg) < 0)
>  			goto out;
>  
> @@ -483,13 +621,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	for (i = 0; i < nb_workers; i++) {
>  		lcore_id = ldm->lcores[i];
> +		if (lcore_id == mcore_id) {
> +			printf("lcore parameters can not use main core id %d\n", mcore_id);
> +			goto out;
> +		}
> +
> +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
> +			printf("lcore parameters can not use offline core id %d\n", lcore_id);
> +			goto out;
> +		}
> +
>  		offset = nr_buf / nb_workers * i;
>  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
>  		if (lcores[i] == NULL) {
>  			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
>  			break;
>  		}
> -		if (is_dma) {
> +		if (cfg->is_dma) {
>  			lcores[i]->dma_name = ldm->dma_names[i];
>  			lcores[i]->dev_id = ldm->dma_ids[i];
>  			lcores[i]->kick_batch = kick_batch;
> @@ -503,10 +651,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  		lcores[i]->scenario_id = cfg->scenario_id;
>  		lcores[i]->lcore_id = lcore_id;
>  
> -		if (is_dma)
> -			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
> -		else
> +		if (cfg->is_sg) {
> +			lcores[i]->src_ptrs = cfg->src_ptrs;
> +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
> +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
> +			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
> +		}
> +
> +		if (cfg->is_dma) {
> +			if (!cfg->is_sg)
> +				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
> +					lcore_id);
> +			else
> +				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
> +					lcore_id);
> +		} else {
>  			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
> +		}
>  	}
>  
>  	while (1) {
> @@ -538,13 +699,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	rte_eal_mp_wait_lcore();
>  
> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> -			   rte_pktmbuf_mtod(dsts[i], void *),
> -			   cfg->buf_size.cur) != 0) {
> -			printf("Copy validation fails for buffer number %d\n", i);
> -			ret = -1;
> -			goto out;
> +	if (!cfg->is_sg) {
> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> +					rte_pktmbuf_mtod(dsts[i], void *),
> +					cfg->buf_size.cur) != 0) {
> +				printf("Copy validation fails for buffer number %d\n", i);
> +				ret = -1;
> +				goto out;
> +			}
> +		}
> +	} else {
> +		size_t src_remsz = buf_size % cfg->src_ptrs;
> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> +		size_t src_sz = buf_size / cfg->src_ptrs;
> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> +		uint8_t src[buf_size], dst[buf_size];
> +		uint8_t *sbuf, *dbuf, *ptr;
> +
> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
> +			sbuf = src;
> +			dbuf = dst;
> +			ptr = NULL;
> +
> +			for (j = 0; j < cfg->src_ptrs; j++) {
> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
> +				memcpy(sbuf, ptr, src_sz);
> +				sbuf += src_sz;
> +			}
> +
> +			if (src_remsz)
> +				memcpy(sbuf, ptr + src_sz, src_remsz);
> +
> +			for (j = 0; j < cfg->dst_ptrs; j++) {
> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
> +				memcpy(dbuf, ptr, dst_sz);
> +				dbuf += dst_sz;
> +			}
> +
> +			if (dst_remsz)
> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> +
> +			if (memcmp(src, dst, buf_size) != 0) {
> +				printf("SG Copy validation fails for buffer number %d\n",
> +					i * cfg->src_ptrs);
> +				ret = -1;
> +				goto out;
> +			}
>  		}
>  	}
>  
> @@ -555,10 +756,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
>  			lcores[i]->worker_info.test_cpl,
>  			&memory, &avg_cycles, &bandwidth, &mops);
> -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
> -					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
> -					avg_cycles, buf_size, nr_buf / nb_workers, memory,
> -					bandwidth, mops, is_dma);
> +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
> +			nr_buf / nb_workers, memory, bandwidth, mops);
>  		mops_total += mops;
>  		bandwidth_total += bandwidth;
>  		avg_cycles_total += avg_cycles;
> @@ -601,13 +800,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	rte_mempool_free(dst_pool);
>  	dst_pool = NULL;
>  
> +	/* free sges for mbufs */
> +	rte_free(src_sges);
> +	src_sges = NULL;
> +
> +	rte_free(dst_sges);
> +	dst_sges = NULL;
> +
>  	/* free the worker parameters */
>  	for (i = 0; i < nb_workers; i++) {
>  		rte_free(lcores[i]);
>  		lcores[i] = NULL;
>  	}
>  
> -	if (is_dma) {
> +	if (cfg->is_dma) {
>  		for (i = 0; i < nb_workers; i++) {
>  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
>  			rte_dma_stop(ldm->dma_ids[i]);
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> index cddcf93c6e..f460b93414 100644
> --- a/app/test-dma-perf/config.ini
> +++ b/app/test-dma-perf/config.ini
> @@ -9,6 +9,8 @@
>  ; "buf_size" denotes the memory size of a single operation.
>  ; "dma_ring_size" denotes the dma ring buffer size. It should be must be a power of two, and between
>  ;  64 and 4096.
> +; "dma_ptrs_src" denotes number of source segments.
> +; "dma_ptrs_dst" denotes number of destination segments.
These two entries are hard to understand; I prefer src_sge_num/dst_sge_num (better names are also welcome).
Please also add a description stating that they:
1. are only valid when the type is DMA_MEM_COPY
2. enable the scatter-gather copy test when both are set.
Thanks.
>  ; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
>  
>  ; The format for variables is variable=first,last,increment,ADD|MUL.
> @@ -69,6 +71,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
>  [case2]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +dma_ptrs_src=4
> +dma_ptrs_dst=1
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +test_seconds=2
> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> +eal_args=--in-memory --file-prefix=test
> +
> +[case3]
>  skip=1
>  type=DMA_MEM_COPY
>  direction=2
> @@ -88,7 +105,7 @@ test_seconds=2
>  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
> -[case3]
> +[case4]
>  type=CPU_MEM_COPY
>  mem_size=10
>  buf_size=64,8192,2,MUL
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index 1d54173a9c..e81eca14e1 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -107,10 +107,8 @@ run_test_case(struct test_configure *case_cfg)
>  
>  	switch (case_cfg->test_type) {
>  	case TEST_TYPE_DMA_MEM_COPY:
> -		ret = mem_copy_benchmark(case_cfg, true);
> -		break;
>  	case TEST_TYPE_CPU_MEM_COPY:
> -		ret = mem_copy_benchmark(case_cfg, false);
> +		ret = mem_copy_benchmark(case_cfg);
>  		break;
>  	default:
>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> @@ -340,7 +338,8 @@ load_configs(const char *path)
>  	const char *case_type;
>  	const char *transfer_dir;
>  	const char *lcore_dma;
> -	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
> +	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
> +		*src_ptrs_str, *dst_ptrs_str;
>  	const char *skip;
>  	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
>  	int args_nr, nb_vp;
> @@ -455,6 +454,7 @@ load_configs(const char *path)
>  			test_case->dcoreid = (uint8_t)atoi(dcoreid);
>  		}
>  
> +		test_case->is_dma = is_dma;
>  		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
>  								section_name, "src_numa_node"));
>  		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> @@ -489,6 +489,32 @@ load_configs(const char *path)
>  			} else if (args_nr == 4)
>  				nb_vp++;
>  
> +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
> +								"dma_ptrs_src");
> +			if (src_ptrs_str != NULL) {
> +				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dma_ptrs_src"));
> +			}
> +
> +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
> +								"dma_ptrs_dst");
> +			if (dst_ptrs_str != NULL) {
> +				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dma_ptrs_dst"));
> +			}
> +
> +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
> +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
> +				printf("parse dma_ptrs_src, dma_ptrs_dst error in case %d.\n",
> +					i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
> +				test_case->is_sg = true;
> +			} else {
> +				test_case->is_sg = false;
> +			}
> +
>  			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
>  			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
>  			if (args_nr < 0) {
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 7dcaa166f2..31e0bf71c9 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -48,11 +48,14 @@ struct test_configure {
>  	uint16_t dst_numa_node;
>  	uint16_t opcode;
>  	bool is_dma;
> +	bool is_sg;
>  	struct lcore_dma_map_t lcore_dma_map;
>  	struct test_configure_entry mem_size;
>  	struct test_configure_entry buf_size;
>  	struct test_configure_entry ring_size;
>  	struct test_configure_entry kick_batch;
> +	uint8_t src_ptrs;
> +	uint8_t dst_ptrs;
>  	uint8_t cache_flush;
>  	uint32_t nr_buf;
>  	uint16_t test_secs;
> @@ -65,6 +68,6 @@ struct test_configure {
>  	uintptr_t raddr;
>  };
>  
> -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> +int mem_copy_benchmark(struct test_configure *cfg);
>  
>  #endif /* MAIN_H */
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXT] Re: [PATCH v8 2/4] app/dma-perf: add PCI device support
  2024-02-21  3:26               ` fengchengwen
@ 2024-02-27  9:27                 ` Amit Prakash Shukla
  0 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27  9:27 UTC (permalink / raw)
  To: fengchengwen, Gowrishankar Muthukrishnan, dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Thomas Monjalon
Hi Chengwen,
Thanks for the review and feedback. I will send the next version with suggested changes.
Thanks,
Amit Shukla
> -----Original Message-----
> From: fengchengwen <fengchengwen@huawei.com>
> Sent: Wednesday, February 21, 2024 8:56 AM
> To: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>;
> dev@dpdk.org; Amit Prakash Shukla <amitprakashs@marvell.com>
> Cc: Anoob Joseph <anoobj@marvell.com>; Cheng Jiang
> <honest.jiang@foxmail.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Thomas Monjalon <thomas@monjalon.net>
> Subject: [EXT] Re: [PATCH v8 2/4] app/dma-perf: add PCI device support
> 
> External Email
> 
> ----------------------------------------------------------------------
<snip>
> >  	if (srcs != NULL)
> >  		rte_pktmbuf_free_bulk(srcs, nr_buf); diff --git
> > a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini index
> > 4d59234b2a..cddcf93c6e 100644
> > --- a/app/test-dma-perf/config.ini
> > +++ b/app/test-dma-perf/config.ini
> > @@ -38,6 +38,23 @@
> >
> >  ; "skip" To skip a test-case set skip to 1.
> >
> > +; Parameters to be configured for data transfers from "mem to dev" and
> "dev to mem":
> > +;
> >
> +==================================================================
> ===
> > +============= ; "direction" denotes the direction of data transfer.
> > +It can take 3 values:
> > +;    0 - mem to mem transfer
> > +;    1 - mem to dev transfer
> > +;    2 - dev to mem transfer
> 
> I prefer a readable string rather than a number, for example:
> mem2mem
> mem2dev
> dev2mem
> 
> > +; If not specified the default value is 0 (mem to mem transfer).
> > +
> > +; "raddr" remote iova address for "mem to dev" and "dev to mem" transfer.
> > +
> > +; "scoreid" denotes source PCIe core index.
> > +; "dcoreid" denotes destination PCIe core index.
> > +; "pfid" denotes PF-id to be used for data transfer ; "vfid" denotes
> > +VF-id of PF-id to be used for data transfer.
> 
> Too many entries, and they are all about PCIe; 'struct rte_dma_port_param' may
> support other buses in the future.
> 
> Suggest a single entry named vchan_dev; the user could input something like:
>    vchan_dev=bus=pcie,coreid=1,pfid=0,vfid=1,addr=xxx
> and add a description noting it is only valid when direction is one of
> mem2dev or dev2mem.
> 
> It could use kvargs library to parse the value of entry vchan_dev
> 
> > +
> > +; =========== End of "mem to dev" and "dev to mem" config parameters.
> > +==============
> > +
> >  [case1]
> >  type=DMA_MEM_COPY
> >  mem_size=10
> > @@ -52,6 +69,26 @@ lcore_dma=lcore10@0000:00:04.2,
> > lcore11@0000:00:04.3  eal_args=--in-memory --file-prefix=test
> >
<snip>
> > +
> > +		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> > +			scoreid = rte_cfgfile_get_entry(cfgfile, section_name,
> "scoreid");
> > +			if (scoreid == NULL) {
> > +				printf("Error: No scoreid configured for
> case%d.\n", i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			}
> > +			test_case->scoreid = (uint8_t)atoi(scoreid);
> > +		}
> > +
> > +		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
> > +			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name,
> "dcoreid");
> > +			if (dcoreid == NULL) {
> > +				printf("Error: No dcoreid configured for
> case%d.\n", i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			}
> > +			test_case->dcoreid = (uint8_t)atoi(dcoreid);
> > +		}
> > +
> 
> Suggest adding a sub-function to wrap the parsing of the device's config.
> 
> 
> >  		test_case->src_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> >
> 	section_name, "src_numa_node"));
> >  		test_case->dst_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h index
> > 32670151af..8ac3270fba 100644
> > --- a/app/test-dma-perf/main.h
> > +++ b/app/test-dma-perf/main.h
> > @@ -42,6 +42,7 @@ struct test_configure {
> >  	bool is_valid;
> >  	bool is_skip;
> >  	uint8_t test_type;
> > +	uint8_t transfer_dir;
> >  	const char *test_type_str;
> >  	uint16_t src_numa_node;
> >  	uint16_t dst_numa_node;
> > @@ -57,6 +58,11 @@ struct test_configure {
> >  	uint16_t test_secs;
> >  	const char *eal_args;
> >  	uint8_t scenario_id;
> > +	uint8_t scoreid;
> > +	uint8_t dcoreid;
> > +	uint8_t pfid;
> > +	uint16_t vfid;
> > +	uintptr_t raddr;
> 
> Suggest creating a new struct:
> struct test_vchan_dev_config {
> 	struct rte_dma_port_param port;
> 	uintptr_t addr;
> };
> 
> So defined as:
>   struct test_vchan_dev_config vchan_dev;
> 
> Thanks
> 
> >  };
> >
> >  void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> >
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v9 0/4] PCI Dev and SG copy support
  2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
                               ` (4 preceding siblings ...)
  2023-12-07 10:11             ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
@ 2024-02-27 16:00             ` Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 1/4] app/dma-perf: add skip support Amit Prakash Shukla
                                 ` (5 more replies)
  5 siblings, 6 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 16:00 UTC (permalink / raw)
  Cc: dev, jerinj, anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan, Chengwen Feng,
	Amit Prakash Shukla
Improve the dma-perf application to support PCI devices and SG copy,
along with the following additional features:
 - validate copied memory
 - allow test cases to be skipped via configuration.
v9:
- Review suggestions.
Amit Prakash Shukla (2):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 392 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  58 +++++
 app/test-dma-perf/main.c      | 171 ++++++++++++---
 app/test-dma-perf/main.h      |  13 +-
 4 files changed, 570 insertions(+), 64 deletions(-)
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v9 1/4] app/dma-perf: add skip support
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
@ 2024-02-27 16:00               ` Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 2/4] app/dma-perf: add PCI device support Amit Prakash Shukla
                                 ` (4 subsequent siblings)
  5 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 16:00 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan, Amit Prakash Shukla
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 48 ++++++++++++++++++++++--------------
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..4d59234b2a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -36,6 +36,8 @@
 ; If you do not specify a result file, one will be generated with the same name as the configuration
 ; file, with the addition of "_result.csv" at the end.
 
+; "skip" To skip a test-case set skip to 1.
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 544784df50..e9e40e72e7 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -86,6 +86,19 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
+static int
+open_output_csv(const char *rst_path_ptr)
+{
+	fd = fopen(rst_path_ptr, "a");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 1;
+	}
+	output_csv(true);
+	fclose(fd);
+	return 0;
+}
+
 static void
 run_test_case(struct test_configure *case_cfg)
 {
@@ -322,6 +335,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -341,6 +355,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -525,31 +546,20 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
-		if (!test_cases[i].is_valid) {
-			printf("Invalid test case %d.\n\n", i + 1);
-			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
-		if (test_cases[i].test_type == TEST_TYPE_NONE) {
-			printf("No valid test type in test case %d.\n\n", i + 1);
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 62085e6e8f..32670151af 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -40,6 +40,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v9 2/4] app/dma-perf: add PCI device support
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 1/4] app/dma-perf: add skip support Amit Prakash Shukla
@ 2024-02-27 16:00               ` Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
                                 ` (3 subsequent siblings)
  5 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 16:00 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan, Amit Prakash Shukla
Add support to test performance for "device to memory" and
"memory to device" data transfer.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v9:
- PCI config parsing using kvargs.
 app/test-dma-perf/benchmark.c | 117 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  33 ++++++++++
 app/test-dma-perf/main.c      |  77 ++++++++++++++++++++++
 app/test-dma-perf/main.h      |   7 ++
 4 files changed, 222 insertions(+), 12 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..4370d71134 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
+		qconf->dst_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
+		qconf->dst_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
+		qconf->src_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
+		qconf->src_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -302,13 +338,23 @@ do_cpu_mem_copy(void *p)
 	return 0;
 }
 
+static void
+dummy_free_ext_buf(void *addr, void *opaque)
+{
+	RTE_SET_USED(addr);
+	RTE_SET_USED(opaque);
+}
+
 static int
 setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
-	unsigned int buf_size = cfg->buf_size.cur;
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
+	unsigned int cur_buf_size = cfg->buf_size.cur;
+	unsigned int buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -321,7 +367,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->src_numa_node);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
@@ -332,7 +378,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->dst_numa_node);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
@@ -361,16 +407,49 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
+	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+		if (ext_buf_info == NULL) {
+			printf("Error: ext_buf_info malloc failed.\n");
+			return -1;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), (rte_iova_t)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), (rte_iova_t)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
 	return 0;
 }
 
 void
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -476,6 +555,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
 
 out:
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+		m = srcs;
+	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
+		m = dsts;
+
+	if (m) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_detach_extbuf(m[i]);
+
+		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+			rte_free(m[0]->shinfo);
+	}
+
 	/* free mbufs used in the test */
 	if (srcs != NULL)
 		rte_pktmbuf_free_bulk(srcs, nr_buf);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 4d59234b2a..9c8221025e 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,23 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
+; ==================================================================================
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    mem2mem - mem to mem transfer
+;    mem2dev - mem to dev transfer
+;    dev2mem - dev to mem transfer
+; If not specified the default value is mem2mem transfer.
+
+; "vchan_dev" comma separated bus related config parameters for mem2dev and dev2mem dma transfer. Ex:
+; vchan_dev=raddr=0x400000,coreid=1,pfid=2,vfid=3
+;    "raddr" remote iova address for mem2dev and dev2mem transfer.
+;    "coreid" denotes PCIe core index.
+;    "pfid" denotes PF-id to be used for data transfer
+;    "vfid" denotes VF-id of PF-id to be used for data transfer.
+
+; =========== End of "mem2dev" and "dev2mem" config parameters. ==============
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
@@ -52,6 +69,22 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=dev2mem
+vchan_dev=raddr=0x200000000,coreid=1,pfid=2,vfid=3
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e9e40e72e7..051f76a6f9 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,8 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
+#include <rte_kvargs.h>
 
 #include "main.h"
 
@@ -325,6 +327,28 @@ parse_entry(const char *value, struct test_configure_entry *entry)
 	return args_nr;
 }
 
+static int populate_pcie_config(const char *key, const char *value, void *test)
+{
+	struct test_configure *test_case = (struct test_configure *)test;
+	char *endptr;
+	int ret = 0;
+
+	if (strcmp(key, "raddr") == 0)
+		test_case->vchan_dev.raddr = strtoull(value, &endptr, 16);
+	else if (strcmp(key, "coreid") == 0)
+		test_case->vchan_dev.port.pcie.coreid = (uint8_t)atoi(value);
+	else if (strcmp(key, "vfid") == 0)
+		test_case->vchan_dev.port.pcie.vfid = (uint16_t)atoi(value);
+	else if (strcmp(key, "pfid") == 0)
+		test_case->vchan_dev.port.pcie.pfid = (uint16_t)atoi(value);
+	else {
+		printf("Invalid config param: %s\n", key);
+		ret = -1;
+	}
+
+	return ret;
+}
+
 static uint16_t
 load_configs(const char *path)
 {
@@ -333,9 +357,12 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	struct rte_kvargs *kvlist;
+	const char *vchan_dev;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -373,6 +400,22 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else {
+				if (strcmp(transfer_dir, "mem2dev") == 0)
+					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_DEV;
+				else if (strcmp(transfer_dir, "dev2mem") == 0)
+					test_case->transfer_dir = RTE_DMA_DIR_DEV_TO_MEM;
+				else {
+					printf("Defaulting the test to MEM to MEM transfer\n");
+					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+				}
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -384,6 +427,40 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			vchan_dev = rte_cfgfile_get_entry(cfgfile, section_name, "vchan_dev");
+			if (vchan_dev == NULL) {
+				printf("Transfer direction mem2dev and dev2mem"
+				       " vhcan_dev shall be configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
+
+			kvlist = rte_kvargs_parse(vchan_dev, NULL);
+			if (kvlist == NULL) {
+				printf("rte_kvargs_parse() error");
+				test_case->is_valid = false;
+				continue;
+			}
+
+			if (rte_kvargs_process(kvlist, NULL, populate_pcie_config,
+					       (void *)test_case) < 0) {
+				printf("rte_kvargs_process() error\n");
+				rte_kvargs_free(kvlist);
+				test_case->is_valid = false;
+				continue;
+			}
+
+			if (!test_case->vchan_dev.raddr) {
+				printf("For mem2dev and dev2mem configure raddr\n");
+				rte_kvargs_free(kvlist);
+				test_case->is_valid = false;
+				continue;
+			}
+			rte_kvargs_free(kvlist);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 32670151af..745c24b7fe 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -38,10 +38,16 @@ struct lcore_dma_map_t {
 	uint16_t cnt;
 };
 
+struct test_vchan_dev_config {
+	struct rte_dma_port_param port;
+	uintptr_t raddr;
+};
+
 struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -57,6 +63,7 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	struct test_vchan_dev_config vchan_dev;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v9 3/4] app/dma-perf: validate copied memory
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 1/4] app/dma-perf: add skip support Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 2/4] app/dma-perf: add PCI device support Amit Prakash Shukla
@ 2024-02-27 16:00               ` Amit Prakash Shukla
  2024-02-27 16:00               ` [PATCH v9 4/4] app/dma-perf: add SG copy support Amit Prakash Shukla
                                 ` (2 subsequent siblings)
  5 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 16:00 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
 app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 4370d71134..0047e2f4b8 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -407,6 +408,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
 	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
 		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
@@ -443,7 +449,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint32_t i;
@@ -461,6 +467,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -534,6 +541,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -599,4 +616,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 051f76a6f9..df05bcd7df 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -101,20 +101,24 @@ open_output_csv(const char *rst_path_ptr)
 	return 0;
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -159,8 +163,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 745c24b7fe..1123e7524a 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -66,6 +66,6 @@ struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v9 4/4] app/dma-perf: add SG copy support
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
                                 ` (2 preceding siblings ...)
  2024-02-27 16:00               ` [PATCH v9 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
@ 2024-02-27 16:00               ` Amit Prakash Shukla
  2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
  2024-02-27 18:56               ` [PATCH v10 4/4] app/dma-perf: add " Amit Prakash Shukla
  5 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 16:00 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Add SG copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v9:
- SG config variables renamed.
 app/test-dma-perf/benchmark.c | 278 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  25 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 300 insertions(+), 42 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0047e2f4b8..25ed6fa6d0 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int cur_buf_size = cfg->buf_size.cur;
@@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	}
 
 	for (i = 0; i < nr_buf; i++) {
-		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
-		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
 	}
 
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
@@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = cur_buf_size / src_ptrs;
+		sglen_dst = cur_buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (cur_buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (cur_buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 9c8221025e..28f6c9d1db 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,14 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for SG copy:
+; ========================================
+; "dma_src_sge" denotes number of source segments.
+; "dma_dst_sge" denotes number of destination segments.
+;
+; For SG copy, both the parameters need to be configured and they are valid only
+; when type is DMA_MEM_COPY.
+;
 ; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
 ; ==================================================================================
 ; "direction" denotes the direction of data transfer. It can take 3 values:
@@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_src_sge=4
+dma_dst_sge=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=dev2mem
@@ -84,7 +107,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index df05bcd7df..a27e4c9429 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -365,7 +363,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	struct rte_kvargs *kvlist;
 	const char *vchan_dev;
@@ -467,6 +466,7 @@ load_configs(const char *path)
 			rte_kvargs_free(kvlist);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -501,6 +501,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_src_sge");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_src_sge"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_dst_sge");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_dst_sge"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_src_sge, dma_dst_sge error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 1123e7524a..baf149b72b 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -53,11 +53,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -66,6 +69,6 @@ struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXT] Re: [PATCH v8 4/4] app/dma-perf: add SG copy support
  2024-02-21  3:52               ` fengchengwen
@ 2024-02-27 16:09                 ` Gowrishankar Muthukrishnan
  0 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-27 16:09 UTC (permalink / raw)
  To: fengchengwen, dev
  Cc: Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla
> > diff --git a/app/test-dma-perf/config.ini
> > b/app/test-dma-perf/config.ini index cddcf93c6e..f460b93414 100644
> > --- a/app/test-dma-perf/config.ini
> > +++ b/app/test-dma-perf/config.ini
> > @@ -9,6 +9,8 @@
> >  ; "buf_size" denotes the memory size of a single operation.
> >  ; "dma_ring_size" denotes the dma ring buffer size. It should be must
> > be a power of two, and between  ;  64 and 4096.
> > +; "dma_ptrs_src" denotes number of source segments.
> > +; "dma_ptrs_dst" denotes number of destination segments.
> 
> These two entries are hard to understand; I prefer src_sge_num/dst_sge_num
> (better names are also welcome). Please also add a description stating that:
> 1. they are only valid when the type is DMA_MEM_COPY; 2. the scatter-gather
> copy test is enabled when both are set.
> 
Yes Chengwen, I have named the config parameters dma_src_sge and dma_dst_sge to fit the context.
Please check https://patches.dpdk.org/project/dpdk/patch/20240227160031.3931694-5-amitprakashs@marvell.com/ .
Thanks,
Gowrishankar
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v10 0/4] PCI Dev and SG copy support
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
                                 ` (3 preceding siblings ...)
  2024-02-27 16:00               ` [PATCH v9 4/4] app/dma-perf: add SG copy support Amit Prakash Shukla
@ 2024-02-27 18:35               ` Amit Prakash Shukla
  2024-02-27 18:35                 ` [PATCH v10 1/4] app/dma-perf: add skip support Amit Prakash Shukla
                                   ` (3 more replies)
  2024-02-27 18:56               ` [PATCH v10 4/4] app/dma-perf: add " Amit Prakash Shukla
  5 siblings, 4 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 18:35 UTC (permalink / raw)
  Cc: dev, jerinj, anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan, Chengwen Feng,
	Amit Prakash Shukla
Improve the dma-perf application to support PCI devices and SG copy,
along with the following additional features:
 - validate copied memory
 - skip test cases that are configured to be skipped.
v10:
- Review suggestions.
- v9 sent again.
Amit Prakash Shukla (2):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
Gowrishankar Muthukrishnan (2):
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 392 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  58 +++++
 app/test-dma-perf/main.c      | 171 ++++++++++++---
 app/test-dma-perf/main.h      |  13 +-
 4 files changed, 570 insertions(+), 64 deletions(-)
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v10 1/4] app/dma-perf: add skip support
  2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
@ 2024-02-27 18:35                 ` Amit Prakash Shukla
  2024-02-27 18:35                 ` [PATCH v10 2/4] app/dma-perf: add PCI device support Amit Prakash Shukla
                                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 18:35 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan, Amit Prakash Shukla
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 48 ++++++++++++++++++++++--------------
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..4d59234b2a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -36,6 +36,8 @@
 ; If you do not specify a result file, one will be generated with the same name as the configuration
 ; file, with the addition of "_result.csv" at the end.
 
+; "skip" To skip a test-case set skip to 1.
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 544784df50..e9e40e72e7 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -86,6 +86,19 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
+static int
+open_output_csv(const char *rst_path_ptr)
+{
+	fd = fopen(rst_path_ptr, "a");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 1;
+	}
+	output_csv(true);
+	fclose(fd);
+	return 0;
+}
+
 static void
 run_test_case(struct test_configure *case_cfg)
 {
@@ -322,6 +335,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -341,6 +355,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -525,31 +546,20 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
-		if (!test_cases[i].is_valid) {
-			printf("Invalid test case %d.\n\n", i + 1);
-			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
-		if (test_cases[i].test_type == TEST_TYPE_NONE) {
-			printf("No valid test type in test case %d.\n\n", i + 1);
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 62085e6e8f..32670151af 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -40,6 +40,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v10 2/4] app/dma-perf: add PCI device support
  2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
  2024-02-27 18:35                 ` [PATCH v10 1/4] app/dma-perf: add skip support Amit Prakash Shukla
@ 2024-02-27 18:35                 ` Amit Prakash Shukla
  2024-02-27 18:35                 ` [PATCH v10 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
  3 siblings, 0 replies; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 18:35 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan, Amit Prakash Shukla
Add support to test performance for "device to memory" and
"memory to device" data transfers.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v10:
- PCI config parsing using kvargs.
 app/test-dma-perf/benchmark.c | 117 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  33 ++++++++++
 app/test-dma-perf/main.c      |  77 ++++++++++++++++++++++
 app/test-dma-perf/main.h      |   7 ++
 4 files changed, 222 insertions(+), 12 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..4370d71134 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
+		qconf->dst_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
+		qconf->dst_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
+		qconf->src_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
+		qconf->src_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -302,13 +338,23 @@ do_cpu_mem_copy(void *p)
 	return 0;
 }
 
+static void
+dummy_free_ext_buf(void *addr, void *opaque)
+{
+	RTE_SET_USED(addr);
+	RTE_SET_USED(opaque);
+}
+
 static int
 setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
-	unsigned int buf_size = cfg->buf_size.cur;
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
+	unsigned int cur_buf_size = cfg->buf_size.cur;
+	unsigned int buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -321,7 +367,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->src_numa_node);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
@@ -332,7 +378,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->dst_numa_node);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
@@ -361,16 +407,49 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
+	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+		if (ext_buf_info == NULL) {
+			printf("Error: ext_buf_info malloc failed.\n");
+			return -1;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), (rte_iova_t)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), (rte_iova_t)(cfg->vchan_dev.raddr +
+						 (i * buf_size)), 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
 	return 0;
 }
 
 void
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -476,6 +555,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
 
 out:
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+		m = srcs;
+	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
+		m = dsts;
+
+	if (m) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_detach_extbuf(m[i]);
+
+		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+			rte_free(m[0]->shinfo);
+	}
+
 	/* free mbufs used in the test */
 	if (srcs != NULL)
 		rte_pktmbuf_free_bulk(srcs, nr_buf);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 4d59234b2a..9c8221025e 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,23 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
+; ==================================================================================
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    mem2mem - mem to mem transfer
+;    mem2dev - mem to dev transfer
+;    dev2mem - dev to mem transfer
+; If not specified the default value is mem2mem transfer.
+
+; "vchan_dev" denotes comma-separated bus-related config parameters for mem2dev and dev2mem DMA transfers. Ex:
+; vchan_dev=raddr=0x400000,coreid=1,pfid=2,vfid=3
+;    "raddr" remote iova address for mem2dev and dev2mem transfer.
+;    "coreid" denotes PCIe core index.
+;    "pfid" denotes PF-id to be used for data transfer
+;    "vfid" denotes VF-id of PF-id to be used for data transfer.
+
+; =========== End of "mem2dev" and "dev2mem" config parameters. ==============
+
 [case1]
 type=DMA_MEM_COPY
 mem_size=10
@@ -52,6 +69,22 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=dev2mem
+vchan_dev=raddr=0x200000000,coreid=1,pfid=2,vfid=3
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e9e40e72e7..051f76a6f9 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,8 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
+#include <rte_kvargs.h>
 
 #include "main.h"
 
@@ -325,6 +327,28 @@ parse_entry(const char *value, struct test_configure_entry *entry)
 	return args_nr;
 }
 
+static int populate_pcie_config(const char *key, const char *value, void *test)
+{	/* kvargs handler: stores one "vchan_dev" key=value pair into *test */
+	struct test_configure *test_case = (struct test_configure *)test;
+	char *endptr;	/* written by strtoull() below, never inspected afterwards */
+	int ret = 0;	/* stays 0 for a recognised key, set to -1 otherwise */
+
+	if (strcmp(key, "raddr") == 0)	/* value is parsed as base 16 */
+		test_case->vchan_dev.raddr = strtoull(value, &endptr, 16);
+	else if (strcmp(key, "coreid") == 0)	/* atoi() result narrowed to uint8_t */
+		test_case->vchan_dev.port.pcie.coreid = (uint8_t)atoi(value);
+	else if (strcmp(key, "vfid") == 0)	/* atoi() result narrowed to uint16_t */
+		test_case->vchan_dev.port.pcie.vfid = (uint16_t)atoi(value);
+	else if (strcmp(key, "pfid") == 0)	/* atoi() result narrowed to uint16_t */
+		test_case->vchan_dev.port.pcie.pfid = (uint16_t)atoi(value);
+	else {
+		printf("Invalid config param: %s\n", key);
+		ret = -1;
+	}
+
+	return ret;
+}
+
 static uint16_t
 load_configs(const char *path)
 {
@@ -333,9 +357,12 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	struct rte_kvargs *kvlist;
+	const char *vchan_dev;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -373,6 +400,22 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else {
+				if (strcmp(transfer_dir, "mem2dev") == 0)
+					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_DEV;
+				else if (strcmp(transfer_dir, "dev2mem") == 0)
+					test_case->transfer_dir = RTE_DMA_DIR_DEV_TO_MEM;
+				else {
+					printf("Defaulting the test to MEM to MEM transfer\n");
+					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+				}
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -384,6 +427,40 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			vchan_dev = rte_cfgfile_get_entry(cfgfile, section_name, "vchan_dev");
+			if (vchan_dev == NULL) {
+				printf("Transfer direction mem2dev and dev2mem"
+				       " vhcan_dev shall be configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
+
+			kvlist = rte_kvargs_parse(vchan_dev, NULL);
+			if (kvlist == NULL) {
+				printf("rte_kvargs_parse() error");
+				test_case->is_valid = false;
+				continue;
+			}
+
+			if (rte_kvargs_process(kvlist, NULL, populate_pcie_config,
+					       (void *)test_case) < 0) {
+				printf("rte_kvargs_process() error\n");
+				rte_kvargs_free(kvlist);
+				test_case->is_valid = false;
+				continue;
+			}
+
+			if (!test_case->vchan_dev.raddr) {
+				printf("For mem2dev and dev2mem configure raddr\n");
+				rte_kvargs_free(kvlist);
+				test_case->is_valid = false;
+				continue;
+			}
+			rte_kvargs_free(kvlist);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 32670151af..745c24b7fe 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -38,10 +38,16 @@ struct lcore_dma_map_t {
 	uint16_t cnt;
 };
 
+struct test_vchan_dev_config {
+	struct rte_dma_port_param port;	/* port.pcie.{coreid,pfid,vfid} come from "vchan_dev" */
+	uintptr_t raddr;	/* value of the "raddr" key of "vchan_dev" */
+};
+
 struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -57,6 +63,7 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	struct test_vchan_dev_config vchan_dev;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v10 3/4] app/dma-perf: validate copied memory
  2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
  2024-02-27 18:35                 ` [PATCH v10 1/4] app/dma-perf: add skip support Amit Prakash Shukla
  2024-02-27 18:35                 ` [PATCH v10 2/4] app/dma-perf: add PCI device support Amit Prakash Shukla
@ 2024-02-27 18:35                 ` Amit Prakash Shukla
  2024-02-28  8:10                   ` fengchengwen
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
  3 siblings, 1 reply; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 18:35 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
 app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 4370d71134..0047e2f4b8 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -407,6 +408,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
 	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
 		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
@@ -443,7 +449,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint32_t i;
@@ -461,6 +467,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -534,6 +541,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+			   rte_pktmbuf_mtod(dsts[i], void *),
+			   cfg->buf_size.cur) != 0) {
+			printf("Copy validation fails for buffer number %d\n", i);
+			ret = -1;
+			goto out;
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -599,4 +616,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 051f76a6f9..df05bcd7df 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -101,20 +101,24 @@ open_output_csv(const char *rst_path_ptr)
 	return 0;
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -159,8 +163,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 745c24b7fe..1123e7524a 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -66,6 +66,6 @@ struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [PATCH v10 4/4] app/dma-perf: add SG copy support
  2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
                                 ` (4 preceding siblings ...)
  2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
@ 2024-02-27 18:56               ` Amit Prakash Shukla
  2024-02-28  9:31                 ` fengchengwen
  5 siblings, 1 reply; 79+ messages in thread
From: Amit Prakash Shukla @ 2024-02-27 18:56 UTC (permalink / raw)
  To: Cheng Jiang, Chengwen Feng
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Add SG copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v10:
- SG config variables renamed.
 app/test-dma-perf/benchmark.c | 278 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  25 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 300 insertions(+), 42 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0047e2f4b8..25ed6fa6d0 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)	/* p: this worker's struct lcore_params; always returns 0 */
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;	/* copies enqueued; also updated by do_dma_submit_and_poll() */
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);	/* one rte_dma_copy_sg() call per loop iteration */
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)	/* busy-wait until start_flag is set */
+		;
+
+	while (1) {
+		j = 0;	/* j is reset here and incremented together with i */
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {	/* submit and poll, then retry same i/j */
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)	/* submit/poll every kick_batch copies */
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)	/* checked only after a full pass over nr_buf */
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {	/* drain, at most POLL_MAX polls */
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int cur_buf_size = cfg->buf_size.cur;
@@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	}
 
 	for (i = 0; i < nr_buf; i++) {
-		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
-		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
 	}
 
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
@@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = cur_buf_size / src_ptrs;
+		sglen_dst = cur_buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (cur_buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (cur_buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 9c8221025e..28f6c9d1db 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,14 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for SG copy:
+; ========================================
+; "dma_src_sge" denotes number of source segments.
+; "dma_dst_sge" denotes number of destination segments.
+;
+; For SG copy, both the parameters need to be configured and they are valid only
+; when type is DMA_MEM_COPY.
+;
 ; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
 ; ==================================================================================
 ; "direction" denotes the direction of data transfer. It can take 3 values:
@@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_src_sge=4
+dma_dst_sge=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=dev2mem
@@ -84,7 +107,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index df05bcd7df..a27e4c9429 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -365,7 +363,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	struct rte_kvargs *kvlist;
 	const char *vchan_dev;
@@ -467,6 +466,7 @@ load_configs(const char *path)
 			rte_kvargs_free(kvlist);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -501,6 +501,32 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_src_sge");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_src_sge"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_dst_sge");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_dst_sge"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_src_sge, dma_dst_sge error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 1123e7524a..baf149b72b 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -53,11 +53,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -66,6 +69,6 @@ struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v10 3/4] app/dma-perf: validate copied memory
  2024-02-27 18:35                 ` [PATCH v10 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
@ 2024-02-28  8:10                   ` fengchengwen
  2024-02-28  9:09                     ` [EXT] " Gowrishankar Muthukrishnan
  0 siblings, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-02-28  8:10 UTC (permalink / raw)
  To: Amit Prakash Shukla, Cheng Jiang, Gowrishankar Muthukrishnan
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson, Pavan Nikhilesh
Hi Gowrishankar,
On 2024/2/28 2:35, Amit Prakash Shukla wrote:
> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> 
> Validate copied memory to ensure DMA copy did not fail.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> Acked-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
>  app/test-dma-perf/benchmark.c | 21 ++++++++++++++++++++-
>  app/test-dma-perf/main.c      | 16 +++++++++++-----
>  app/test-dma-perf/main.h      |  2 +-
>  3 files changed, 32 insertions(+), 7 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 4370d71134..0047e2f4b8 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -12,6 +12,7 @@
>  #include <rte_dmadev.h>
>  #include <rte_malloc.h>
>  #include <rte_lcore.h>
> +#include <rte_random.h>
>  
>  #include "main.h"
>  
> @@ -407,6 +408,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		return -1;
>  	}
>  
> +	for (i = 0; i < nr_buf; i++) {
> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> +	}
> +
>  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
>  	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
>  		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
> @@ -443,7 +449,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  	return 0;
>  }
>  
> -void
> +int
>  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  {
>  	uint32_t i;
> @@ -461,6 +467,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	uint32_t avg_cycles_total;
>  	float mops, mops_total;
>  	float bandwidth, bandwidth_total;
> +	int ret = 0;
>  
>  	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
>  		goto out;
> @@ -534,6 +541,16 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	rte_eal_mp_wait_lcore();
>  
> +	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> +		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> +			   rte_pktmbuf_mtod(dsts[i], void *),
> +			   cfg->buf_size.cur) != 0) {
> +			printf("Copy validation fails for buffer number %d\n", i);
For non-mem2mem DMA, such as mem2dev or dev2mem, the device host address may not be directly accessible by the CPU
(and even if it is, it may need to be mmap'ed first).
So please restrict this validation to mem2mem only, or drop this commit.
Thanks
> +			ret = -1;
> +			goto out;
> +		}
> +	}
> +
>  	mops_total = 0;
>  	bandwidth_total = 0;
>  	avg_cycles_total = 0;
> @@ -599,4 +616,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  			rte_dma_stop(ldm->dma_ids[i]);
>  		}
>  	}
> +
> +	return ret;
>  }
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index 051f76a6f9..df05bcd7df 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -101,20 +101,24 @@ open_output_csv(const char *rst_path_ptr)
>  	return 0;
>  }
>  
> -static void
> +static int
>  run_test_case(struct test_configure *case_cfg)
>  {
> +	int ret = 0;
> +
>  	switch (case_cfg->test_type) {
>  	case TEST_TYPE_DMA_MEM_COPY:
> -		mem_copy_benchmark(case_cfg, true);
> +		ret = mem_copy_benchmark(case_cfg, true);
>  		break;
>  	case TEST_TYPE_CPU_MEM_COPY:
> -		mem_copy_benchmark(case_cfg, false);
> +		ret = mem_copy_benchmark(case_cfg, false);
>  		break;
>  	default:
>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
>  		break;
>  	}
> +
> +	return ret;
>  }
>  
>  static void
> @@ -159,8 +163,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
>  		case_cfg->scenario_id++;
>  		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
>  
> -		run_test_case(case_cfg);
> -		output_csv(false);
> +		if (run_test_case(case_cfg) < 0)
> +			printf("\nTest fails! skipping this scenario.\n");
> +		else
> +			output_csv(false);
>  
>  		if (var_entry->op == OP_ADD)
>  			var_entry->cur += var_entry->incr;
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 745c24b7fe..1123e7524a 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -66,6 +66,6 @@ struct test_configure {
>  	struct test_vchan_dev_config vchan_dev;
>  };
>  
> -void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> +int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
>  
>  #endif /* MAIN_H */
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXT] Re: [PATCH v10 3/4] app/dma-perf: validate copied memory
  2024-02-28  8:10                   ` fengchengwen
@ 2024-02-28  9:09                     ` Gowrishankar Muthukrishnan
  0 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-28  9:09 UTC (permalink / raw)
  To: fengchengwen, Amit Prakash Shukla, Cheng Jiang
  Cc: dev, Jerin Jacob, Anoob Joseph, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula
Hi Fengchengwen,
> > +	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> > +		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> > +			   rte_pktmbuf_mtod(dsts[i], void *),
> > +			   cfg->buf_size.cur) != 0) {
> > +			printf("Copy validation fails for buffer number %d\n",
> i);
> 
> For non-mem2mem DMA, like mem2dev or dev2mem, the device host
> address may not direct accessable by CPU (if could may need mmap).
> 
This has already been checked in patch 4/4 as (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM). Would you still need it in this patch itself?
Thanks,
Gowrishankar
> So pls restrict it only mem2mem, or drop this commit.
> 
> Thanks
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
  2024-02-27 18:56               ` [PATCH v10 4/4] app/dma-perf: add " Amit Prakash Shukla
@ 2024-02-28  9:31                 ` fengchengwen
  2024-02-29 13:16                   ` [EXT] " Gowrishankar Muthukrishnan
  0 siblings, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-02-28  9:31 UTC (permalink / raw)
  To: Amit Prakash Shukla, Cheng Jiang
  Cc: dev, jerinj, anoobj, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Gowrishankar Muthukrishnan
Hi Gowrishankar,
On 2024/2/28 2:56, Amit Prakash Shukla wrote:
> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> 
> Add SG copy support.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> Acked-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
> v10:
> - SG config variables renamed.
> 
>  app/test-dma-perf/benchmark.c | 278 +++++++++++++++++++++++++++++-----
>  app/test-dma-perf/config.ini  |  25 ++-
>  app/test-dma-perf/main.c      |  34 ++++-
>  app/test-dma-perf/main.h      |   5 +-
>  4 files changed, 300 insertions(+), 42 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 0047e2f4b8..25ed6fa6d0 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -46,6 +46,10 @@ struct lcore_params {
>  	uint16_t test_secs;
>  	struct rte_mbuf **srcs;
>  	struct rte_mbuf **dsts;
> +	struct rte_dma_sge *src_sges;
> +	struct rte_dma_sge *dst_sges;
> +	uint8_t src_ptrs;
> +	uint8_t dst_ptrs;
1. src/dst_ptrs -> src/dst_nb_sge
2. How about wrapping these four fields in a struct?
>  	volatile struct worker_info worker_info;
>  };
>  
> @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
>  }
>  
>  static void
> -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
> -			float memory, float bandwidth, float mops, bool is_dma)
> +output_result(struct test_configure *cfg, struct lcore_params *para,
> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
> +			uint32_t nr_buf, float memory, float bandwidth, float mops)
>  {
> -	if (is_dma)
> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
> -				lcore_id, dma_name, ring_size, kick_batch);
> -	else
> +	uint16_t ring_size = cfg->ring_size.cur;
> +	uint8_t scenario_id = cfg->scenario_id;
> +	uint32_t lcore_id = para->lcore_id;
> +	char *dma_name = para->dma_name;
> +
> +	if (cfg->is_dma) {
> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
> +		       dma_name, ring_size, kick_batch);
> +		if (cfg->is_sg)
> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> +			       para->src_ptrs, para->dst_ptrs);
DMA src sges: %u DMA dst sges: %u
I think we should add a column, perhaps titled "misc", with content such as sg-src[4]-dst[1];
later, when we add a fill test, this field could hold pattern-0x12345678.
Also, for the "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit, if the DMA
works in a non-mem2mem direction, we could add a simple description of the direction and pcie.info
in the above misc column.
> +		printf(".\n");
> +	} else {
>  		printf("lcore %u\n", lcore_id);
> +	}
>  
>  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
>  			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
>  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
>  
> -	if (is_dma)
> +	if (cfg->is_dma)
>  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
>  			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
>  			nr_buf, memory, ave_cycle, bandwidth, mops);
> @@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
>  
>  /* Configuration of device. */
>  static void
> -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
>  {
>  	uint16_t vchan = 0;
>  	struct rte_dma_info info;
> @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
>  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
>  				dev_id);
>  
> +	if (info.max_sges < ptrs_max)
> +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
"Error with unsupport max_sges on device id %u.\n"
> +				dev_id);
> +
>  	if (rte_dma_start(dev_id) != 0)
>  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
>  }
> @@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
>  	uint32_t i;
>  	int dev_id;
>  	uint16_t nb_dmadevs = 0;
> +	uint8_t ptrs_max = 0;
It is hard to understand; how about nb_sge?
>  	char *dma_name;
>  
> +	if (cfg->is_sg)
> +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
> +
>  	for (i = 0; i < ldm->cnt; i++) {
>  		dma_name = ldm->dma_names[i];
>  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
>  		}
>  
>  		ldm->dma_ids[i] = dev_id;
> -		configure_dmadev_queue(dev_id, cfg);
> +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
>  		++nb_dmadevs;
>  	}
>  
> @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
>  }
>  
>  static inline int
> -do_dma_mem_copy(void *p)
> +do_dma_plain_mem_copy(void *p)
>  {
>  	struct lcore_params *para = (struct lcore_params *)p;
>  	volatile struct worker_info *worker_info = &(para->worker_info);
> @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
>  	return 0;
>  }
>  
> +static inline int
> +do_dma_sg_mem_copy(void *p)
> +{
> +	struct lcore_params *para = (struct lcore_params *)p;
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	struct rte_dma_sge *src_sges = para->src_sges;
> +	struct rte_dma_sge *dst_sges = para->dst_sges;
> +	const uint16_t kick_batch = para->kick_batch;
> +	const uint8_t src_ptrs = para->src_ptrs;
> +	const uint8_t dst_ptrs = para->dst_ptrs;
> +	const uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint64_t async_cnt = 0;
> +	uint32_t poll_cnt = 0;
> +	uint16_t nr_cpl;
> +	uint32_t i, j;
> +	int ret;
> +
> +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		j = 0;
> +		for (i = 0; i < nr_buf; i++) {
> +dma_copy:
> +			ret = rte_dma_copy_sg(dev_id, 0,
> +				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
> +				src_ptrs, dst_ptrs, 0);
> +			if (unlikely(ret < 0)) {
> +				if (ret == -ENOSPC) {
> +					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> +					goto dma_copy;
> +				} else
> +					error_exit(dev_id);
> +			}
> +			async_cnt++;
> +			j++;
> +
> +			if ((async_cnt % kick_batch) == 0)
> +				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +
>  static inline int
>  do_cpu_mem_copy(void *p)
>  {
> @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
>  }
>  
>  static int
> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> -			struct rte_mbuf ***dsts)
> +setup_memory_env(struct test_configure *cfg,
> +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
> +			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
>  {
>  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
>  	unsigned int cur_buf_size = cfg->buf_size.cur;
> @@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  	}
>  
>  	for (i = 0; i < nr_buf; i++) {
> -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
> -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
>  	}
>  
>  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
> @@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		}
>  	}
>  
> +	if (cfg->is_sg) {
> +		uint8_t src_ptrs = cfg->src_ptrs;
> +		uint8_t dst_ptrs = cfg->dst_ptrs;
> +		uint32_t sglen_src, sglen_dst;
> +
> +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
> +					RTE_CACHE_LINE_SIZE);
> +		if (*src_sges == NULL) {
> +			printf("Error: src_sges array malloc failed.\n");
> +			return -1;
> +		}
> +
> +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
> +					RTE_CACHE_LINE_SIZE);
> +		if (*dst_sges == NULL) {
> +			printf("Error: dst_sges array malloc failed.\n");
> +			return -1;
> +		}
> +
> +		sglen_src = cur_buf_size / src_ptrs;
> +		sglen_dst = cur_buf_size / dst_ptrs;
> +
> +		for (i = 0; i < nr_buf; i++) {
> +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
> +			(*src_sges)[i].length = sglen_src;
> +			if (!((i+1) % src_ptrs))
> +				(*src_sges)[i].length += (cur_buf_size % src_ptrs);
> +
> +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
> +			(*dst_sges)[i].length = sglen_dst;
> +			if (!((i+1) % dst_ptrs))
> +				(*dst_sges)[i].length += (cur_buf_size % dst_ptrs);
> +		}
> +	}
> +
>  	return 0;
>  }
>  
>  int
> -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
> +mem_copy_benchmark(struct test_configure *cfg)
>  {
> -	uint32_t i;
> +	uint32_t i, j;
>  	uint32_t offset;
>  	unsigned int lcore_id = 0;
>  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
> +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	const uint32_t mcore_id = rte_get_main_lcore();
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	uint16_t kick_batch = cfg->kick_batch.cur;
> -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
>  	uint16_t nb_workers = ldm->cnt;
>  	uint16_t test_secs = cfg->test_secs;
>  	float memory = 0;
> @@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	uint32_t avg_cycles_total;
>  	float mops, mops_total;
>  	float bandwidth, bandwidth_total;
> +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
> +	uint32_t nr_buf;
>  	int ret = 0;
>  
> -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +	/* Align number of buffers according to workers count */
> +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +	nr_buf -= (nr_buf % nb_workers);
> +	if (cfg->is_sg) {
> +		nr_buf /= nb_workers;
> +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
> +		nr_buf *= nb_workers;
> +
> +		if (cfg->dst_ptrs > cfg->src_ptrs) {
> +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
> +			nr_sgdst = nr_buf;
> +		} else {
> +			nr_sgsrc = nr_buf;
> +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
> +		}
> +	}
Please move this into a subfunction.
> +
> +	cfg->nr_buf = nr_buf;
> +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
>  		goto out;
>  
> -	if (is_dma)
> +	if (cfg->is_dma)
>  		if (config_dmadevs(cfg) < 0)
>  			goto out;
>  
> @@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	for (i = 0; i < nb_workers; i++) {
>  		lcore_id = ldm->lcores[i];
> +		if (lcore_id == mcore_id) {
> +			printf("lcore parameters can not use main core id %d\n", mcore_id);
> +			goto out;
> +		}
> +
> +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
> +			printf("lcore parameters can not use offline core id %d\n", lcore_id);
> +			goto out;
> +		}
The above two checks should be in a separate commit.
> +
>  		offset = nr_buf / nb_workers * i;
>  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
>  		if (lcores[i] == NULL) {
>  			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
>  			break;
>  		}
> -		if (is_dma) {
> +		if (cfg->is_dma) {
>  			lcores[i]->dma_name = ldm->dma_names[i];
>  			lcores[i]->dev_id = ldm->dma_ids[i];
>  			lcores[i]->kick_batch = kick_batch;
> @@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  		lcores[i]->scenario_id = cfg->scenario_id;
>  		lcores[i]->lcore_id = lcore_id;
>  
> -		if (is_dma)
> -			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
> -		else
> +		if (cfg->is_sg) {
> +			lcores[i]->src_ptrs = cfg->src_ptrs;
> +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
> +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
> +			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
> +		}
> +
> +		if (cfg->is_dma) {
> +			if (!cfg->is_sg)
> +				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
> +					lcore_id);
> +			else
> +				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
> +					lcore_id);
> +		} else {
>  			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
> +		}
There are too many conditions for selecting the target function; how about wrapping it in a subfunction:
lcore_function_t get_work_function(struct test_configure *cfg)
then rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]), lcore_id);
>  	}
>  
>  	while (1) {
> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	rte_eal_mp_wait_lcore();
>  
> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> -			   rte_pktmbuf_mtod(dsts[i], void *),
> -			   cfg->buf_size.cur) != 0) {
> -			printf("Copy validation fails for buffer number %d\n", i);
> -			ret = -1;
> -			goto out;
> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> +					rte_pktmbuf_mtod(dsts[i], void *),
> +					cfg->buf_size.cur) != 0) {
> +				printf("Copy validation fails for buffer number %d\n", i);
> +				ret = -1;
> +				goto out;
> +			}
> +		}
> +	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
> +		size_t src_remsz = buf_size % cfg->src_ptrs;
> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> +		size_t src_sz = buf_size / cfg->src_ptrs;
> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> +		uint8_t src[buf_size], dst[buf_size];
> +		uint8_t *sbuf, *dbuf, *ptr;
> +
> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
> +			sbuf = src;
> +			dbuf = dst;
> +			ptr = NULL;
> +
> +			for (j = 0; j < cfg->src_ptrs; j++) {
> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
> +				memcpy(sbuf, ptr, src_sz);
> +				sbuf += src_sz;
> +			}
> +
> +			if (src_remsz)
> +				memcpy(sbuf, ptr + src_sz, src_remsz);
> +
> +			for (j = 0; j < cfg->dst_ptrs; j++) {
> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
> +				memcpy(dbuf, ptr, dst_sz);
> +				dbuf += dst_sz;
> +			}
> +
> +			if (dst_remsz)
> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> +
> +			if (memcmp(src, dst, buf_size) != 0) {
> +				printf("SG Copy validation fails for buffer number %d\n",
> +					i * cfg->src_ptrs);
> +				ret = -1;
> +				goto out;
> +			}
Now I doubt the value of this verification: it cannot detect a copy failure in a middle round,
because as long as the last round of copying is successful, the validation will pass.
And adding validation to every round of copying will impact performance.
Also, app/test_dmadev already verifies data, so I think we should drop the validation commit.
>  		}
>  	}
>  
> @@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
>  			lcores[i]->worker_info.test_cpl,
>  			&memory, &avg_cycles, &bandwidth, &mops);
> -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
> -					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
> -					avg_cycles, buf_size, nr_buf / nb_workers, memory,
> -					bandwidth, mops, is_dma);
> +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
> +			nr_buf / nb_workers, memory, bandwidth, mops);
>  		mops_total += mops;
>  		bandwidth_total += bandwidth;
>  		avg_cycles_total += avg_cycles;
> @@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	rte_mempool_free(dst_pool);
>  	dst_pool = NULL;
>  
> +	/* free sges for mbufs */
> +	rte_free(src_sges);
> +	src_sges = NULL;
> +
> +	rte_free(dst_sges);
> +	dst_sges = NULL;
> +
>  	/* free the worker parameters */
>  	for (i = 0; i < nb_workers; i++) {
>  		rte_free(lcores[i]);
>  		lcores[i] = NULL;
>  	}
>  
> -	if (is_dma) {
> +	if (cfg->is_dma) {
>  		for (i = 0; i < nb_workers; i++) {
>  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
>  			rte_dma_stop(ldm->dma_ids[i]);
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> index 9c8221025e..28f6c9d1db 100644
> --- a/app/test-dma-perf/config.ini
> +++ b/app/test-dma-perf/config.ini
> @@ -38,6 +38,14 @@
>  
>  ; "skip" To skip a test-case set skip to 1.
Please place the descriptions of the entries newly added by this patchset above the
"; To specify a configuration file, use the "--config" flag followed by the path to the file."
line, because the original config.ini first has the parameter descriptions, then the program argument description, and finally the examples.
>  
> +; Parameters to be configured for SG copy:
Parameters for DMA scatter-gather memory copy:
> +; ========================================
Please remove this line
> +; "dma_src_sge" denotes number of source segments.
> +; "dma_dst_sge" denotes number of destination segments.
> +;
> +; For SG copy, both the parameters need to be configured and they are valid only
> +; when type is DMA_MEM_COPY.
For DMA scatter-gather memory copy, the parameters need to be configured and they are valid only
when type is DMA_MEM_COPY.
> +;
>  ; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
>  ; ==================================================================================
Please remove this line
As per the review feedback on another commit, "Re: [PATCH v2] app/dma-perf: support bi-directional transfer",
these descriptions should be placed after
"
; To use DMA for a test, please specify the "lcore_dma" parameter.
; If you have already set the "-l" and "-a" parameters using EAL,
; make sure that the value of "lcore_dma" falls within their range of the values.
; We have to ensure a 1:1 mapping between the core and DMA device.
"
>  ; "direction" denotes the direction of data transfer. It can take 3 values:
> @@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
>  [case2]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +dma_src_sge=4
> +dma_dst_sge=1
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +test_seconds=2
> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> +eal_args=--in-memory --file-prefix=test
> +
> +[case3]
>  skip=1
>  type=DMA_MEM_COPY
>  direction=dev2mem
> @@ -84,7 +107,7 @@ test_seconds=2
>  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
> -[case3]
> +[case4]
>  type=CPU_MEM_COPY
>  mem_size=10
>  buf_size=64,8192,2,MUL
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index df05bcd7df..a27e4c9429 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
>  
>  	switch (case_cfg->test_type) {
>  	case TEST_TYPE_DMA_MEM_COPY:
> -		ret = mem_copy_benchmark(case_cfg, true);
> -		break;
>  	case TEST_TYPE_CPU_MEM_COPY:
> -		ret = mem_copy_benchmark(case_cfg, false);
> +		ret = mem_copy_benchmark(case_cfg);
>  		break;
>  	default:
>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> @@ -365,7 +363,8 @@ load_configs(const char *path)
>  	const char *case_type;
>  	const char *transfer_dir;
>  	const char *lcore_dma;
> -	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
> +	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
> +		*src_ptrs_str, *dst_ptrs_str;
>  	const char *skip;
>  	struct rte_kvargs *kvlist;
>  	const char *vchan_dev;
> @@ -467,6 +466,7 @@ load_configs(const char *path)
>  			rte_kvargs_free(kvlist);
>  		}
>  
> +		test_case->is_dma = is_dma;
>  		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
>  								section_name, "src_numa_node"));
>  		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> @@ -501,6 +501,32 @@ load_configs(const char *path)
>  			} else if (args_nr == 4)
>  				nb_vp++;
>  
> +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
> +								"dma_src_sge");
> +			if (src_ptrs_str != NULL) {
> +				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dma_src_sge"));
> +			}
> +
> +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
> +								"dma_dst_sge");
> +			if (dst_ptrs_str != NULL) {
> +				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dma_dst_sge"));
> +			}
> +
> +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
> +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
Please also check that test_case->src_ptrs and test_case->dst_ptrs are valid: make sure they are >1 and <=UINT16_MAX
> +				printf("parse dma_src_sge, dma_dst_sge error in case %d.\n",
> +					i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
> +				test_case->is_sg = true;
> +			} else {
> +				test_case->is_sg = false;
The above could be simplified to: test_case->is_sg = (src_ptrs_str != NULL && dst_ptrs_str != NULL);
> +			}
> +
>  			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
>  			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
>  			if (args_nr < 0) {
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 1123e7524a..baf149b72b 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -53,11 +53,14 @@ struct test_configure {
>  	uint16_t dst_numa_node;
>  	uint16_t opcode;
>  	bool is_dma;
> +	bool is_sg;
>  	struct lcore_dma_map_t lcore_dma_map;
>  	struct test_configure_entry mem_size;
>  	struct test_configure_entry buf_size;
>  	struct test_configure_entry ring_size;
>  	struct test_configure_entry kick_batch;
> +	uint8_t src_ptrs;
> +	uint8_t dst_ptrs;
>  	uint8_t cache_flush;
>  	uint32_t nr_buf;
>  	uint16_t test_secs;
> @@ -66,6 +69,6 @@ struct test_configure {
>  	struct test_vchan_dev_config vchan_dev;
>  };
>  
> -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> +int mem_copy_benchmark(struct test_configure *cfg);
>  
>  #endif /* MAIN_H */
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
  2024-02-28  9:31                 ` fengchengwen
@ 2024-02-29 13:16                   ` Gowrishankar Muthukrishnan
  2024-03-01  2:07                     ` fengchengwen
  0 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-29 13:16 UTC (permalink / raw)
  To: fengchengwen, Amit Prakash Shukla, Cheng Jiang
  Cc: dev, Jerin Jacob, Anoob Joseph, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula
Hi Fengcheng,
> -----Original Message-----
> From: fengchengwen <fengchengwen@huawei.com>
> Sent: Wednesday, February 28, 2024 3:02 PM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>; Cheng Jiang
> <honest.jiang@foxmail.com>
> Cc: dev@dpdk.org; Jerin Jacob <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>
> Subject: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
> 
> External Email
> 
> ----------------------------------------------------------------------
> Hi Gowrishankar,
> 
> On 2024/2/28 2:56, Amit Prakash Shukla wrote:
> > From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> >
> > Add SG copy support.
> >
> > Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> > Acked-by: Anoob Joseph <anoobj@marvell.com>
> > Acked-by: Chengwen Feng <fengchengwen@huawei.com>
> > ---
> > v10:
> > - SG config variables renamed.
> >
> >  app/test-dma-perf/benchmark.c | 278
> > +++++++++++++++++++++++++++++-----
> >  app/test-dma-perf/config.ini  |  25 ++-
> >  app/test-dma-perf/main.c      |  34 ++++-
> >  app/test-dma-perf/main.h      |   5 +-
> >  4 files changed, 300 insertions(+), 42 deletions(-)
> >
> > diff --git a/app/test-dma-perf/benchmark.c
> > b/app/test-dma-perf/benchmark.c index 0047e2f4b8..25ed6fa6d0 100644
> > --- a/app/test-dma-perf/benchmark.c
> > +++ b/app/test-dma-perf/benchmark.c
> > @@ -46,6 +46,10 @@ struct lcore_params {
> >  	uint16_t test_secs;
> >  	struct rte_mbuf **srcs;
> >  	struct rte_mbuf **dsts;
> > +	struct rte_dma_sge *src_sges;
> > +	struct rte_dma_sge *dst_sges;
> > +	uint8_t src_ptrs;
> > +	uint8_t dst_ptrs;
> 
> 1. src/dst_ptrs -> src/dst_nb_sge
Ack.
> 2. How about wrap these four fields as a struct?
Ack.
> 
> >  	volatile struct worker_info worker_info;  };
> >
> > @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf,
> > uint16_t nb_workers, uint16_t te  }
> >
> >  static void
> > -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
> uint16_t ring_size,
> > -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> buf_size, uint32_t nr_buf,
> > -			float memory, float bandwidth, float mops, bool
> is_dma)
> > +output_result(struct test_configure *cfg, struct lcore_params *para,
> > +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> buf_size,
> > +			uint32_t nr_buf, float memory, float bandwidth, float
> mops)
> >  {
> > -	if (is_dma)
> > -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> %u.\n",
> > -				lcore_id, dma_name, ring_size, kick_batch);
> > -	else
> > +	uint16_t ring_size = cfg->ring_size.cur;
> > +	uint8_t scenario_id = cfg->scenario_id;
> > +	uint32_t lcore_id = para->lcore_id;
> > +	char *dma_name = para->dma_name;
> > +
> > +	if (cfg->is_dma) {
> > +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> %u", lcore_id,
> > +		       dma_name, ring_size, kick_batch);
> > +		if (cfg->is_sg)
> > +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> > +			       para->src_ptrs, para->dst_ptrs);
> 
> DMA src sges: %u DMA dst sges: %u
> 
> I think we should add a column which title maybe misc, some like sg-src[4]-
> dst[1], and later we may add fill test, then this field could be pattern-
> 0x12345678
> 
> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit, if
> the DMA was worked in non-mem2mem direction, we could add simple
> descriptor of direction and pcie.info in the above misc column.
> 
I am sorry, I could not understand the complete picture here. Do you mean we
reserve a column and use it as per the test type?
For plain mem copy, nothing is added.
For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print "sg-src[1]-dst[4]".
In future, when we add a fill test to the benchmark, this line would instead be "pattern-0x12345678".
Is my understanding correct here?
> > +		printf(".\n");
> > +	} else {
> >  		printf("lcore %u\n", lcore_id);
> > +	}
> >
> >  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer
> Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
> >  			ave_cycle, buf_size, nr_buf, memory,
> rte_get_timer_hz()/1000000000.0);
> >  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth,
> > mops);
> >
> > -	if (is_dma)
> > +	if (cfg->is_dma)
> >  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> CSV_LINE_DMA_FMT,
> >  			scenario_id, lcore_id, dma_name, ring_size,
> kick_batch, buf_size,
> >  			nr_buf, memory, ave_cycle, bandwidth, mops); @@ -
> 167,7 +181,7 @@
> > vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
> >
> >  /* Configuration of device. */
> >  static void
> > -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
> > +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg,
> > +uint8_t ptrs_max)
> >  {
> >  	uint16_t vchan = 0;
> >  	struct rte_dma_info info;
> > @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct
> test_configure *cfg)
> >  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported
> on device id. %u\n",
> >  				dev_id);
> >
> > +	if (info.max_sges < ptrs_max)
> > +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported
> by
> > +device id %u.\n",
> 
> "Error with unsupport max_sges on device id %u.\n"
Ack.
> 
> > +				dev_id);
> > +
> >  	if (rte_dma_start(dev_id) != 0)
> >  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");  } @@ -
> 202,8
> > +220,12 @@ config_dmadevs(struct test_configure *cfg)
> >  	uint32_t i;
> >  	int dev_id;
> >  	uint16_t nb_dmadevs = 0;
> > +	uint8_t ptrs_max = 0;
> 
> It hard to understand, how about nb_sge?
Ack. Renamed it to nb_sges.
> 
> >  	char *dma_name;
> >
> > +	if (cfg->is_sg)
> > +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
> > +
> >  	for (i = 0; i < ldm->cnt; i++) {
> >  		dma_name = ldm->dma_names[i];
> >  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> > @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
> >  		}
> >
> >  		ldm->dma_ids[i] = dev_id;
> > -		configure_dmadev_queue(dev_id, cfg);
> > +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
> >  		++nb_dmadevs;
> >  	}
> >
> > @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id,
> uint64_t *async_cnt,
> >  }
> >
> >  static inline int
> > -do_dma_mem_copy(void *p)
> > +do_dma_plain_mem_copy(void *p)
> >  {
> >  	struct lcore_params *para = (struct lcore_params *)p;
> >  	volatile struct worker_info *worker_info = &(para->worker_info);
> > @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
> >  	return 0;
> >  }
> >
> > +static inline int
> > +do_dma_sg_mem_copy(void *p)
> > +{
> > +	struct lcore_params *para = (struct lcore_params *)p;
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	struct rte_dma_sge *src_sges = para->src_sges;
> > +	struct rte_dma_sge *dst_sges = para->dst_sges;
> > +	const uint16_t kick_batch = para->kick_batch;
> > +	const uint8_t src_ptrs = para->src_ptrs;
> > +	const uint8_t dst_ptrs = para->dst_ptrs;
> > +	const uint16_t dev_id = para->dev_id;
> > +	uint32_t nr_buf = para->nr_buf;
> > +	uint64_t async_cnt = 0;
> > +	uint32_t poll_cnt = 0;
> > +	uint16_t nr_cpl;
> > +	uint32_t i, j;
> > +	int ret;
> > +
> > +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		j = 0;
> > +		for (i = 0; i < nr_buf; i++) {
> > +dma_copy:
> > +			ret = rte_dma_copy_sg(dev_id, 0,
> > +				&src_sges[i * src_ptrs], &dst_sges[j *
> dst_ptrs],
> > +				src_ptrs, dst_ptrs, 0);
> > +			if (unlikely(ret < 0)) {
> > +				if (ret == -ENOSPC) {
> > +					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> > +					goto dma_copy;
> > +				} else
> > +					error_exit(dev_id);
> > +			}
> > +			async_cnt++;
> > +			j++;
> > +
> > +			if ((async_cnt % kick_batch) == 0)
> > +				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> > +		}
> > +
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	rte_dma_submit(dev_id, 0);
> > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
> NULL, NULL);
> > +		async_cnt -= nr_cpl;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static inline int
> >  do_cpu_mem_copy(void *p)
> >  {
> > @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
> >  }
> >
> >  static int
> > -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> > -			struct rte_mbuf ***dsts)
> > +setup_memory_env(struct test_configure *cfg,
> > +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
> > +			 struct rte_dma_sge **src_sges, struct rte_dma_sge
> **dst_sges)
> >  {
> >  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
> >  	unsigned int cur_buf_size = cfg->buf_size.cur;
> > @@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg,
> struct rte_mbuf ***srcs,
> >  	}
> >
> >  	for (i = 0; i < nr_buf; i++) {
> > -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
> buf_size);
> > -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> > +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
> cur_buf_size);
> > +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0,
> cur_buf_size);
> >  	}
> >
> >  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
> > @@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg,
> struct rte_mbuf ***srcs,
> >  		}
> >  	}
> >
> > +	if (cfg->is_sg) {
> > +		uint8_t src_ptrs = cfg->src_ptrs;
> > +		uint8_t dst_ptrs = cfg->dst_ptrs;
> > +		uint32_t sglen_src, sglen_dst;
> > +
> > +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
> rte_dma_sge),
> > +					RTE_CACHE_LINE_SIZE);
> > +		if (*src_sges == NULL) {
> > +			printf("Error: src_sges array malloc failed.\n");
> > +			return -1;
> > +		}
> > +
> > +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
> rte_dma_sge),
> > +					RTE_CACHE_LINE_SIZE);
> > +		if (*dst_sges == NULL) {
> > +			printf("Error: dst_sges array malloc failed.\n");
> > +			return -1;
> > +		}
> > +
> > +		sglen_src = cur_buf_size / src_ptrs;
> > +		sglen_dst = cur_buf_size / dst_ptrs;
> > +
> > +		for (i = 0; i < nr_buf; i++) {
> > +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
> > +			(*src_sges)[i].length = sglen_src;
> > +			if (!((i+1) % src_ptrs))
> > +				(*src_sges)[i].length += (cur_buf_size %
> src_ptrs);
> > +
> > +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
> > +			(*dst_sges)[i].length = sglen_dst;
> > +			if (!((i+1) % dst_ptrs))
> > +				(*dst_sges)[i].length += (cur_buf_size %
> dst_ptrs);
> > +		}
> > +	}
> > +
> >  	return 0;
> >  }
> >
> >  int
> > -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
> > +mem_copy_benchmark(struct test_configure *cfg)
> >  {
> > -	uint32_t i;
> > +	uint32_t i, j;
> >  	uint32_t offset;
> >  	unsigned int lcore_id = 0;
> >  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
> > +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
> >  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	const uint32_t mcore_id = rte_get_main_lcore();
> >  	unsigned int buf_size = cfg->buf_size.cur;
> >  	uint16_t kick_batch = cfg->kick_batch.cur;
> > -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> >  	uint16_t nb_workers = ldm->cnt;
> >  	uint16_t test_secs = cfg->test_secs;
> >  	float memory = 0;
> > @@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >  	uint32_t avg_cycles_total;
> >  	float mops, mops_total;
> >  	float bandwidth, bandwidth_total;
> > +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
> > +	uint32_t nr_buf;
> >  	int ret = 0;
> >
> > -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +	/* Align number of buffers according to workers count */
> > +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> > +	nr_buf -= (nr_buf % nb_workers);
> > +	if (cfg->is_sg) {
> > +		nr_buf /= nb_workers;
> > +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
> > +		nr_buf *= nb_workers;
> > +
> > +		if (cfg->dst_ptrs > cfg->src_ptrs) {
> > +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
> > +			nr_sgdst = nr_buf;
> > +		} else {
> > +			nr_sgsrc = nr_buf;
> > +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
> > +		}
> > +	}
> 
> pls move to a subfunction
Ack.
> 
> > +
> > +	cfg->nr_buf = nr_buf;
> > +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
> >  		goto out;
> >
> > -	if (is_dma)
> > +	if (cfg->is_dma)
> >  		if (config_dmadevs(cfg) < 0)
> >  			goto out;
> >
> > @@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >
> >  	for (i = 0; i < nb_workers; i++) {
> >  		lcore_id = ldm->lcores[i];
> > +		if (lcore_id == mcore_id) {
> > +			printf("lcore parameters can not use main core id
> %d\n", mcore_id);
> > +			goto out;
> > +		}
> > +
> > +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
> > +			printf("lcore parameters can not use offline core id
> %d\n", lcore_id);
> > +			goto out;
> > +		}
> 
> The above two judgement should in a seperate commit.
Sorry, somehow it got mixed in from a different patch I had in my local repo.
It will be in a different commit.
> 
> > +
> >  		offset = nr_buf / nb_workers * i;
> >  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
> >  		if (lcores[i] == NULL) {
> >  			printf("lcore parameters malloc failure for lcore %d\n",
> lcore_id);
> >  			break;
> >  		}
> > -		if (is_dma) {
> > +		if (cfg->is_dma) {
> >  			lcores[i]->dma_name = ldm->dma_names[i];
> >  			lcores[i]->dev_id = ldm->dma_ids[i];
> >  			lcores[i]->kick_batch = kick_batch;
> > @@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >  		lcores[i]->scenario_id = cfg->scenario_id;
> >  		lcores[i]->lcore_id = lcore_id;
> >
> > -		if (is_dma)
> > -			rte_eal_remote_launch(do_dma_mem_copy, (void
> *)(lcores[i]), lcore_id);
> > -		else
> > +		if (cfg->is_sg) {
> > +			lcores[i]->src_ptrs = cfg->src_ptrs;
> > +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
> > +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers
> * i);
> > +			lcores[i]->dst_sges = dst_sges + (nr_sgdst /
> nb_workers * i);
> > +		}
> > +
> > +		if (cfg->is_dma) {
> > +			if (!cfg->is_sg)
> > +
> 	rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
> > +					lcore_id);
> > +			else
> > +
> 	rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
> > +					lcore_id);
> > +		} else {
> >  			rte_eal_remote_launch(do_cpu_mem_copy, (void
> *)(lcores[i]), lcore_id);
> > +		}
> 
> too many judgement for selecting target function, how about wrap it
> subfunction:
> lcore_function_t get_work_function(struct test_configure *cfg)
> then rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]),
> lcore_id);
> 
Ack.
> >  	}
> >
> >  	while (1) {
> > @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >
> >  	rte_eal_mp_wait_lcore();
> >
> > -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> > -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> > -			   rte_pktmbuf_mtod(dsts[i], void *),
> > -			   cfg->buf_size.cur) != 0) {
> > -			printf("Copy validation fails for buffer number %d\n",
> i);
> > -			ret = -1;
> > -			goto out;
> > +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
> {
> > +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> > +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> > +					rte_pktmbuf_mtod(dsts[i], void *),
> > +					cfg->buf_size.cur) != 0) {
> > +				printf("Copy validation fails for buffer number
> %d\n", i);
> > +				ret = -1;
> > +				goto out;
> > +			}
> > +		}
> > +	} else if (cfg->is_sg && cfg->transfer_dir ==
> RTE_DMA_DIR_MEM_TO_MEM) {
> > +		size_t src_remsz = buf_size % cfg->src_ptrs;
> > +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> > +		size_t src_sz = buf_size / cfg->src_ptrs;
> > +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> > +		uint8_t src[buf_size], dst[buf_size];
> > +		uint8_t *sbuf, *dbuf, *ptr;
> > +
> > +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
> i++) {
> > +			sbuf = src;
> > +			dbuf = dst;
> > +			ptr = NULL;
> > +
> > +			for (j = 0; j < cfg->src_ptrs; j++) {
> > +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
> + j], uint8_t *);
> > +				memcpy(sbuf, ptr, src_sz);
> > +				sbuf += src_sz;
> > +			}
> > +
> > +			if (src_remsz)
> > +				memcpy(sbuf, ptr + src_sz, src_remsz);
> > +
> > +			for (j = 0; j < cfg->dst_ptrs; j++) {
> > +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
> + j], uint8_t *);
> > +				memcpy(dbuf, ptr, dst_sz);
> > +				dbuf += dst_sz;
> > +			}
> > +
> > +			if (dst_remsz)
> > +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> > +
> > +			if (memcmp(src, dst, buf_size) != 0) {
> > +				printf("SG Copy validation fails for buffer
> number %d\n",
> > +					i * cfg->src_ptrs);
> > +				ret = -1;
> > +				goto out;
> > +			}
> 
> Now I doubt the value of verify, this verify can't find the middle round copy
> failure,
> because as long as the last round copy is successful, the validation will pass.
> 
Validation is on the entire buffer. If any middle copy fails, the entire memcmp
would fail. Or am I missing something?
> And adding validatation in every round copy will impact performance.
> 
This validation runs only after the worker function has stopped measuring perf.
How would this impact performance?
> Also app/test_dmadev already verify data. so I think we should drop the
> validation commit.
In some corner cases, or with unknown issues, the copy could have failed,
and measuring perf cycles would then be meaningless. That is the reason this validation
is added after the perf function has done its job.
> 
> >  		}
> >  	}
> >
> > @@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg,
> bool is_dma)
> >  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> >  			lcores[i]->worker_info.test_cpl,
> >  			&memory, &avg_cycles, &bandwidth, &mops);
> > -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
> > -					lcores[i]->dma_name, cfg-
> >ring_size.cur, kick_batch,
> > -					avg_cycles, buf_size, nr_buf /
> nb_workers, memory,
> > -					bandwidth, mops, is_dma);
> > +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
> > +			nr_buf / nb_workers, memory, bandwidth, mops);
> >  		mops_total += mops;
> >  		bandwidth_total += bandwidth;
> >  		avg_cycles_total += avg_cycles;
> > @@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >  	rte_mempool_free(dst_pool);
> >  	dst_pool = NULL;
> >
> > +	/* free sges for mbufs */
> > +	rte_free(src_sges);
> > +	src_sges = NULL;
> > +
> > +	rte_free(dst_sges);
> > +	dst_sges = NULL;
> > +
> >  	/* free the worker parameters */
> >  	for (i = 0; i < nb_workers; i++) {
> >  		rte_free(lcores[i]);
> >  		lcores[i] = NULL;
> >  	}
> >
> > -	if (is_dma) {
> > +	if (cfg->is_dma) {
> >  		for (i = 0; i < nb_workers; i++) {
> >  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> >  			rte_dma_stop(ldm->dma_ids[i]);
> > diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> > index 9c8221025e..28f6c9d1db 100644
> > --- a/app/test-dma-perf/config.ini
> > +++ b/app/test-dma-perf/config.ini
> > @@ -38,6 +38,14 @@
> >
> >  ; "skip" To skip a test-case set skip to 1.
> 
> Please place hese patchset new add entrys' descriptions above the
> "; To specify a configuration file, use the "--config" flag followed by the path to
> the file."
> 
> because original config.ini, fist is parameters descriptor, and then program
> argment descriptor, and last was example.
> 
Ack.
> >
> > +; Parameters to be configured for SG copy:
> 
> Parameters for DMA scatter-gather memory copy:
> 
Ack.
> > +; ========================================
> 
> Please remove this line
> 
Ack.
> > +; "dma_src_sge" denotes number of source segments.
> > +; "dma_dst_sge" denotes number of destination segments.
> > +;
> > +; For SG copy, both the parameters need to be configured and they are valid
> only
> > +; when type is DMA_MEM_COPY.
> 
> For DMA scatter-gather memory copy, the parameters need to be configured
> and they are valid only
> when type is DMA_MEM_COPY.
> 
Ack.
> > +;
> >  ; Parameters to be configured for data transfers from "mem to dev" and
> "dev to mem":
> >  ;
> ===================================================================
> ===============
> 
> Please remove this line
> 
> As another commit "Re: [PATCH v2] app/dma-perf: support bi-directional
> transfer"'s review feedback,
> these descriptor should place after
> "
> ; To use DMA for a test, please specify the "lcore_dma" parameter.
> ; If you have already set the "-l" and "-a" parameters using EAL,
> ; make sure that the value of "lcore_dma" falls within their range of the values.
> ; We have to ensure a 1:1 mapping between the core and DMA device.
> "
> 
> 
> >  ; "direction" denotes the direction of data transfer. It can take 3 values:
> > @@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2,
> lcore11@0000:00:04.3
> >  eal_args=--in-memory --file-prefix=test
> >
> >  [case2]
> > +type=DMA_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +dma_src_sge=4
> > +dma_dst_sge=1
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=0
> > +cache_flush=0
> > +test_seconds=2
> > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > +eal_args=--in-memory --file-prefix=test
> > +
> > +[case3]
> >  skip=1
> >  type=DMA_MEM_COPY
> >  direction=dev2mem
> > @@ -84,7 +107,7 @@ test_seconds=2
> >  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> >  eal_args=--in-memory --file-prefix=test
> >
> > -[case3]
> > +[case4]
> >  type=CPU_MEM_COPY
> >  mem_size=10
> >  buf_size=64,8192,2,MUL
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > index df05bcd7df..a27e4c9429 100644
> > --- a/app/test-dma-perf/main.c
> > +++ b/app/test-dma-perf/main.c
> > @@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
> >
> >  	switch (case_cfg->test_type) {
> >  	case TEST_TYPE_DMA_MEM_COPY:
> > -		ret = mem_copy_benchmark(case_cfg, true);
> > -		break;
> >  	case TEST_TYPE_CPU_MEM_COPY:
> > -		ret = mem_copy_benchmark(case_cfg, false);
> > +		ret = mem_copy_benchmark(case_cfg);
> >  		break;
> >  	default:
> >  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> > @@ -365,7 +363,8 @@ load_configs(const char *path)
> >  	const char *case_type;
> >  	const char *transfer_dir;
> >  	const char *lcore_dma;
> > -	const char *mem_size_str, *buf_size_str, *ring_size_str,
> *kick_batch_str;
> > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> *kick_batch_str,
> > +		*src_ptrs_str, *dst_ptrs_str;
> >  	const char *skip;
> >  	struct rte_kvargs *kvlist;
> >  	const char *vchan_dev;
> > @@ -467,6 +466,7 @@ load_configs(const char *path)
> >  			rte_kvargs_free(kvlist);
> >  		}
> >
> > +		test_case->is_dma = is_dma;
> >  		test_case->src_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> >
> 	section_name, "src_numa_node"));
> >  		test_case->dst_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > @@ -501,6 +501,32 @@ load_configs(const char *path)
> >  			} else if (args_nr == 4)
> >  				nb_vp++;
> >
> > +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > +
> 	"dma_src_sge");
> > +			if (src_ptrs_str != NULL) {
> > +				test_case->src_ptrs =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +
> 	section_name, "dma_src_sge"));
> > +			}
> > +
> > +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > +
> 	"dma_dst_sge");
> > +			if (dst_ptrs_str != NULL) {
> > +				test_case->dst_ptrs =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +
> 	section_name, "dma_dst_sge"));
> > +			}
> > +
> > +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
> > +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
> 
> Please also check test_case->src_ptrs and test_case->dst_ptrs valid, make sure
> there are >1 and <=UINT16_MAX
At present, this is uint8_t. Do we need it to be more than UINT8_MAX?
> 
> > +				printf("parse dma_src_sge, dma_dst_sge error
> in case %d.\n",
> > +					i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL)
> {
> > +				test_case->is_sg = true;
> > +			} else {
> > +				test_case->is_sg = false;
> 
> the above could simple by: test_case->is_sg = (src_ptrs_str != NULL &&
> dst_ptrs_str != NULL);
> 
Added a check for nb_ validation here. Please check it in the next version of the patch.
> > +			}
> > +
> >  			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
> section_name, "kick_batch");
> >  			args_nr = parse_entry(kick_batch_str, &test_case-
> >kick_batch);
> >  			if (args_nr < 0) {
> > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> > index 1123e7524a..baf149b72b 100644
> > --- a/app/test-dma-perf/main.h
> > +++ b/app/test-dma-perf/main.h
> > @@ -53,11 +53,14 @@ struct test_configure {
> >  	uint16_t dst_numa_node;
> >  	uint16_t opcode;
> >  	bool is_dma;
> > +	bool is_sg;
> >  	struct lcore_dma_map_t lcore_dma_map;
> >  	struct test_configure_entry mem_size;
> >  	struct test_configure_entry buf_size;
> >  	struct test_configure_entry ring_size;
> >  	struct test_configure_entry kick_batch;
> > +	uint8_t src_ptrs;
> > +	uint8_t dst_ptrs;
> >  	uint8_t cache_flush;
> >  	uint32_t nr_buf;
> >  	uint16_t test_secs;
> > @@ -66,6 +69,6 @@ struct test_configure {
> >  	struct test_vchan_dev_config vchan_dev;
> >  };
> >
> > -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> > +int mem_copy_benchmark(struct test_configure *cfg);
> >
> >  #endif /* MAIN_H */
> >
Thank you for your review. Please let me know if any other changes are needed;
I hope the next version goes through 😊
Regards,
Gowrishankar
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [v11 0/4] PCI Dev and SG copy support
  2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
                                   ` (2 preceding siblings ...)
  2024-02-27 18:35                 ` [PATCH v10 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
@ 2024-02-29 13:48                 ` Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
                                     ` (4 more replies)
  3 siblings, 5 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-29 13:48 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng, Jerin,
	Gowrishankar Muthukrishnan
Improve the dma-perf application to support PCI dev and SG copy,
along with the following additional features:
 - validate copied memory
 - skip tests if not opted.
v11:
- Review suggestions.
Gowrishankar Muthukrishnan (4):
  app/dma-perf: add skip support
  app/dma-perf: add PCI device support
  app/dma-perf: validate copied memory
  app/dma-perf: add SG copy support
 app/test-dma-perf/benchmark.c | 413 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  56 +++++
 app/test-dma-perf/main.c      | 178 ++++++++++++---
 app/test-dma-perf/main.h      |  13 +-
 4 files changed, 595 insertions(+), 65 deletions(-)
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [v11 1/4] app/dma-perf: add skip support
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
@ 2024-02-29 13:48                   ` Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
                                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-29 13:48 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng, Jerin,
	Gowrishankar Muthukrishnan
Add support to skip running a dma-perf test-case.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v11:
 - config file formatting
 app/test-dma-perf/config.ini |  2 ++
 app/test-dma-perf/main.c     | 48 ++++++++++++++++++++++--------------
 app/test-dma-perf/main.h     |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index b550f4b23f..bb0b1aa11a 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -30,6 +30,8 @@
 ; If you have already set the "-l" and "-a" parameters using EAL,
 ; make sure that the value of "lcore" falls within their range of values.
 
+; "skip" To skip a test-case set skip to 1.
+
 ; To specify a configuration file, use the "--config" flag followed by the path to the file.
 
 ; To specify a result file, use the "--result" flag followed by the path to the file.
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 544784df50..e9e40e72e7 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -86,6 +86,19 @@ output_header(uint32_t case_id, struct test_configure *case_cfg)
 	output_csv(true);
 }
 
+static int
+open_output_csv(const char *rst_path_ptr)
+{
+	fd = fopen(rst_path_ptr, "a");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 1;
+	}
+	output_csv(true);
+	fclose(fd);
+	return 0;
+}
+
 static void
 run_test_case(struct test_configure *case_cfg)
 {
@@ -322,6 +335,7 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *skip;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -341,6 +355,13 @@ load_configs(const char *path)
 	for (i = 0; i < nb_sections; i++) {
 		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
 		test_case = &test_cases[i];
+
+		skip = rte_cfgfile_get_entry(cfgfile, section_name, "skip");
+		if (skip && (atoi(skip) == 1)) {
+			test_case->is_skip = true;
+			continue;
+		}
+
 		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
 		if (case_type == NULL) {
 			printf("Error: No case type in case %d, the test will be finished here.\n",
@@ -525,31 +546,20 @@ main(int argc, char *argv[])
 
 	printf("Running cases...\n");
 	for (i = 0; i < case_nb; i++) {
-		if (!test_cases[i].is_valid) {
-			printf("Invalid test case %d.\n\n", i + 1);
-			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+		if (test_cases[i].is_skip) {
+			printf("Test case %d configured to be skipped.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Skip the test-case %d\n",
+				 i + 1);
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
-		if (test_cases[i].test_type == TEST_TYPE_NONE) {
-			printf("No valid test type in test case %d.\n\n", i + 1);
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
 			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
-
-			fd = fopen(rst_path_ptr, "a");
-			if (!fd) {
-				printf("Open output CSV file error.\n");
+			if (open_output_csv(rst_path_ptr))
 				return 0;
-			}
-			output_csv(true);
-			fclose(fd);
 			continue;
 		}
 
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 62085e6e8f..32670151af 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -40,6 +40,7 @@ struct lcore_dma_map_t {
 
 struct test_configure {
 	bool is_valid;
+	bool is_skip;
 	uint8_t test_type;
 	const char *test_type_str;
 	uint16_t src_numa_node;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [v11 2/4] app/dma-perf: add PCI device support
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
@ 2024-02-29 13:48                   ` Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
                                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-29 13:48 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng, Jerin,
	Gowrishankar Muthukrishnan
Add support to test performance for "device to memory" and
"memory to device" data transfer.
Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v11:
 - config file formatting.
 app/test-dma-perf/benchmark.c | 119 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  31 +++++++++
 app/test-dma-perf/main.c      |  77 ++++++++++++++++++++++
 app/test-dma-perf/main.h      |   7 ++
 4 files changed, 222 insertions(+), 12 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..3c4fddb138 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -127,17 +127,54 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
 #endif
 }
 
+static int
+vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
+		    struct test_configure *cfg)
+{
+	struct rte_dma_info info;
+
+	qconf->direction = cfg->transfer_dir;
+
+	rte_dma_info_get(dev_id, &info);
+	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
+		return -1;
+
+	qconf->nb_desc = cfg->ring_size.cur;
+
+	switch (qconf->direction) {
+	case RTE_DMA_DIR_MEM_TO_DEV:
+		qconf->dst_port.pcie.vfen = 1;
+		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->dst_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
+		qconf->dst_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
+		qconf->dst_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		break;
+	case RTE_DMA_DIR_DEV_TO_MEM:
+		qconf->src_port.pcie.vfen = 1;
+		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
+		qconf->src_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
+		qconf->src_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
+		qconf->src_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		break;
+	case RTE_DMA_DIR_MEM_TO_MEM:
+	case RTE_DMA_DIR_DEV_TO_DEV:
+		break;
+	}
+
+	return 0;
+}
+
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
-	struct rte_dma_vchan_conf qconf = {
-		.direction = RTE_DMA_DIR_MEM_TO_MEM,
-		.nb_desc = ring_size
-	};
+	struct rte_dma_vchan_conf qconf = { 0 };
+
+	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
@@ -159,7 +196,6 @@ configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	uint32_t ring_size = cfg->ring_size.cur;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	uint32_t nb_workers = ldm->cnt;
 	uint32_t i;
@@ -176,7 +212,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, ring_size);
+		configure_dmadev_queue(dev_id, cfg);
 		++nb_dmadevs;
 	}
 
@@ -302,13 +338,23 @@ do_cpu_mem_copy(void *p)
 	return 0;
 }
 
+static void
+dummy_free_ext_buf(void *addr, void *opaque)
+{
+	RTE_SET_USED(addr);
+	RTE_SET_USED(opaque);
+}
+
 static int
 setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			struct rte_mbuf ***dsts)
 {
-	unsigned int buf_size = cfg->buf_size.cur;
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
+	unsigned int cur_buf_size = cfg->buf_size.cur;
+	unsigned int buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
+	uint32_t i;
 
 	nr_sockets = rte_socket_count();
 	if (cfg->src_numa_node >= nr_sockets ||
@@ -321,7 +367,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->src_numa_node);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
@@ -332,7 +378,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->dst_numa_node);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
@@ -361,16 +407,51 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
+	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+		if (ext_buf_info == NULL) {
+			printf("Error: ext_buf_info malloc failed.\n");
+			return -1;
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i],
+				(void *)(cfg->vchan_dev.raddr + (i * buf_size)),
+				(rte_iova_t)(cfg->vchan_dev.raddr + (i * buf_size)),
+				0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*dsts)[i],
+				(void *)(cfg->vchan_dev.raddr + (i * buf_size)),
+				(rte_iova_t)(cfg->vchan_dev.raddr + (i * buf_size)),
+				0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
 	return 0;
 }
 
 void
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
-	uint16_t i;
+	uint32_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -476,6 +557,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
 
 out:
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+		m = srcs;
+	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
+		m = dsts;
+
+	if (m) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_detach_extbuf(m[i]);
+
+		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+			rte_free(m[0]->shinfo);
+	}
+
 	/* free mbufs used in the test */
 	if (srcs != NULL)
 		rte_pktmbuf_free_bulk(srcs, nr_buf);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index bb0b1aa11a..ada0146b92 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -32,6 +32,21 @@
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters for data transfers from "mem to dev" and "dev to mem":
+;
+; "direction" denotes the direction of data transfer. It can take 3 values:
+;    mem2mem - mem to mem transfer
+;    mem2dev - mem to dev transfer
+;    dev2mem - dev to mem transfer
+; If not specified, the default is mem2mem transfer.
+
+; "vchan_dev" denotes the following comma-separated, bus-related parameters for mem2dev and dev2mem DMA transfer:
+;    "raddr" denotes the remote IOVA address for mem2dev and dev2mem transfer.
+;    "coreid" denotes the PCIe core index.
+;    "pfid" denotes the PF-id to be used for data transfer.
+;    "vfid" denotes the VF-id of the PF-id to be used for data transfer.
+;    Example: vchan_dev=raddr=0x400000,coreid=1,pfid=2,vfid=3
+
 ; To specify a configuration file, use the "--config" flag followed by the path to the file.
 
 ; To specify a result file, use the "--result" flag followed by the path to the file.
@@ -52,6 +67,22 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+skip=1
+type=DMA_MEM_COPY
+direction=dev2mem
+vchan_dev=raddr=0x200000000,coreid=1,pfid=2,vfid=3
+mem_size=10
+buf_size=64,4096,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e9e40e72e7..051f76a6f9 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -16,6 +16,8 @@
 #include <rte_cfgfile.h>
 #include <rte_string_fns.h>
 #include <rte_lcore.h>
+#include <rte_dmadev.h>
+#include <rte_kvargs.h>
 
 #include "main.h"
 
@@ -325,6 +327,28 @@ parse_entry(const char *value, struct test_configure_entry *entry)
 	return args_nr;
 }
 
+static int populate_pcie_config(const char *key, const char *value, void *test)
+{
+	struct test_configure *test_case = (struct test_configure *)test;
+	char *endptr;
+	int ret = 0;
+
+	if (strcmp(key, "raddr") == 0)
+		test_case->vchan_dev.raddr = strtoull(value, &endptr, 16);
+	else if (strcmp(key, "coreid") == 0)
+		test_case->vchan_dev.port.pcie.coreid = (uint8_t)atoi(value);
+	else if (strcmp(key, "vfid") == 0)
+		test_case->vchan_dev.port.pcie.vfid = (uint16_t)atoi(value);
+	else if (strcmp(key, "pfid") == 0)
+		test_case->vchan_dev.port.pcie.pfid = (uint16_t)atoi(value);
+	else {
+		printf("Invalid config param: %s\n", key);
+		ret = -1;
+	}
+
+	return ret;
+}
+
 static uint16_t
 load_configs(const char *path)
 {
@@ -333,9 +357,12 @@ load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
+	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
 	const char *skip;
+	struct rte_kvargs *kvlist;
+	const char *vchan_dev;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -373,6 +400,22 @@ load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
+
+			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
+			if (transfer_dir == NULL) {
+				printf("Transfer direction not configured."
+					" Defaulting it to MEM to MEM transfer.\n");
+				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+			} else {
+				if (strcmp(transfer_dir, "mem2dev") == 0)
+					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_DEV;
+				else if (strcmp(transfer_dir, "dev2mem") == 0)
+					test_case->transfer_dir = RTE_DMA_DIR_DEV_TO_MEM;
+				else {
+					printf("Defaulting the test to MEM to MEM transfer\n");
+					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
+				}
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -384,6 +427,40 @@ load_configs(const char *path)
 			continue;
 		}
 
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
+			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+			vchan_dev = rte_cfgfile_get_entry(cfgfile, section_name, "vchan_dev");
+			if (vchan_dev == NULL) {
+				printf("Transfer direction mem2dev and dev2mem"
+				       " vhcan_dev shall be configured.\n");
+				test_case->is_valid = false;
+				continue;
+			}
+
+			kvlist = rte_kvargs_parse(vchan_dev, NULL);
+			if (kvlist == NULL) {
+				printf("rte_kvargs_parse() error");
+				test_case->is_valid = false;
+				continue;
+			}
+
+			if (rte_kvargs_process(kvlist, NULL, populate_pcie_config,
+					       (void *)test_case) < 0) {
+				printf("rte_kvargs_process() error\n");
+				rte_kvargs_free(kvlist);
+				test_case->is_valid = false;
+				continue;
+			}
+
+			if (!test_case->vchan_dev.raddr) {
+				printf("For mem2dev and dev2mem configure raddr\n");
+				rte_kvargs_free(kvlist);
+				test_case->is_valid = false;
+				continue;
+			}
+			rte_kvargs_free(kvlist);
+		}
+
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 32670151af..745c24b7fe 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -38,10 +38,16 @@ struct lcore_dma_map_t {
 	uint16_t cnt;
 };
 
+struct test_vchan_dev_config {
+	struct rte_dma_port_param port;
+	uintptr_t raddr;
+};
+
 struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
+	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
@@ -57,6 +63,7 @@ struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
+	struct test_vchan_dev_config vchan_dev;
 };
 
 void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [v11 3/4] app/dma-perf: validate copied memory
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
@ 2024-02-29 13:48                   ` Gowrishankar Muthukrishnan
  2024-02-29 13:48                   ` [v11 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
  2024-03-06 19:50                   ` [v11 0/4] PCI Dev and " Thomas Monjalon
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-29 13:48 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng, Jerin,
	Gowrishankar Muthukrishnan
Validate copied memory to ensure DMA copy did not fail.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
 app/test-dma-perf/benchmark.c | 23 ++++++++++++++++++++++-
 app/test-dma-perf/main.c      | 16 +++++++++++-----
 app/test-dma-perf/main.h      |  2 +-
 3 files changed, 34 insertions(+), 7 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 3c4fddb138..9c155a58cc 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -12,6 +12,7 @@
 #include <rte_dmadev.h>
 #include <rte_malloc.h>
 #include <rte_lcore.h>
+#include <rte_random.h>
 
 #include "main.h"
 
@@ -407,6 +408,11 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
+	for (i = 0; i < nr_buf; i++) {
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+	}
+
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
 	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
 		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
@@ -445,7 +451,7 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	return 0;
 }
 
-void
+int
 mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 {
 	uint32_t i;
@@ -463,6 +469,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	int ret = 0;
 
 	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
 		goto out;
@@ -536,6 +543,18 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+				   rte_pktmbuf_mtod(dsts[i], void *),
+				   cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	}
+
 	mops_total = 0;
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
@@ -601,4 +620,6 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			rte_dma_stop(ldm->dma_ids[i]);
 		}
 	}
+
+	return ret;
 }
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 051f76a6f9..df05bcd7df 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -101,20 +101,24 @@ open_output_csv(const char *rst_path_ptr)
 	return 0;
 }
 
-static void
+static int
 run_test_case(struct test_configure *case_cfg)
 {
+	int ret = 0;
+
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		mem_copy_benchmark(case_cfg, true);
+		ret = mem_copy_benchmark(case_cfg, true);
 		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg, false);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
 		break;
 	}
+
+	return ret;
 }
 
 static void
@@ -159,8 +163,10 @@ run_test(uint32_t case_id, struct test_configure *case_cfg)
 		case_cfg->scenario_id++;
 		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
 
-		run_test_case(case_cfg);
-		output_csv(false);
+		if (run_test_case(case_cfg) < 0)
+			printf("\nTest fails! skipping this scenario.\n");
+		else
+			output_csv(false);
 
 		if (var_entry->op == OP_ADD)
 			var_entry->cur += var_entry->incr;
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 745c24b7fe..1123e7524a 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -66,6 +66,6 @@ struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* [v11 4/4] app/dma-perf: add SG copy support
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
                                     ` (2 preceding siblings ...)
  2024-02-29 13:48                   ` [v11 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
@ 2024-02-29 13:48                   ` Gowrishankar Muthukrishnan
  2024-03-06 19:50                   ` [v11 0/4] PCI Dev and " Thomas Monjalon
  4 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-02-29 13:48 UTC (permalink / raw)
  To: dev
  Cc: anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng, Jerin,
	Gowrishankar Muthukrishnan
Add SG copy support.
Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v11:
 - using struct for SGE config.
 app/test-dma-perf/benchmark.c | 283 ++++++++++++++++++++++++++++++----
 app/test-dma-perf/config.ini  |  25 ++-
 app/test-dma-perf/main.c      |  41 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 317 insertions(+), 37 deletions(-)
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9c155a58cc..d821af8532 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -34,6 +34,13 @@ struct worker_info {
 	uint32_t test_cpl;
 };
 
+struct sge_info {
+	struct rte_dma_sge *srcs;
+	struct rte_dma_sge *dsts;
+	uint8_t nb_srcs;
+	uint8_t nb_dsts;
+};
+
 struct lcore_params {
 	uint8_t scenario_id;
 	unsigned int lcore_id;
@@ -46,6 +53,7 @@ struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct sge_info sge;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +94,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src sges: %u, dst sges: %u",
+			       para->sge.nb_srcs, para->sge.nb_dsts);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +185,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t sges_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +208,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < sges_max)
+		rte_exit(EXIT_FAILURE, "Error with unsupported max_sges on device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +224,12 @@ config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t nb_sges = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		nb_sges = RTE_MAX(cfg->nb_src_sges, cfg->nb_dst_sges);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +239,7 @@ config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, nb_sges);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +279,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +332,65 @@ do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->sge.srcs;
+	struct rte_dma_sge *dst_sges = para->sge.dsts;
+	const uint8_t nb_src_sges = para->sge.nb_srcs;
+	const uint8_t nb_dst_sges = para->sge.nb_dsts;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(nb_src_sges, nb_dst_sges);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * nb_src_sges], &dst_sges[j * nb_dst_sges],
+				nb_src_sges, nb_dst_sges, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -347,8 +432,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int cur_buf_size = cfg->buf_size.cur;
@@ -409,8 +495,8 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	}
 
 	for (i = 0; i < nr_buf; i++) {
-		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
-		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
 	}
 
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
@@ -448,20 +534,102 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t nb_src_sges = cfg->nb_src_sges;
+		uint8_t nb_dst_sges = cfg->nb_dst_sges;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = cur_buf_size / nb_src_sges;
+		sglen_dst = cur_buf_size / nb_dst_sges;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % nb_src_sges))
+				(*src_sges)[i].length += (cur_buf_size % nb_src_sges);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % nb_dst_sges))
+				(*dst_sges)[i].length += (cur_buf_size % nb_dst_sges);
+		}
+	}
+
 	return 0;
 }
 
+static uint32_t
+align_buffer_count(struct test_configure *cfg, uint32_t *nr_sgsrc, uint32_t *nr_sgdst)
+{
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint16_t nb_workers = ldm->cnt;
+	uint32_t nr_buf;
+
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+
+	if (nr_sgsrc == NULL || nr_sgdst == NULL)
+		return nr_buf;
+
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->nb_src_sges * cfg->nb_dst_sges);
+		nr_buf *= nb_workers;
+
+		if (cfg->nb_dst_sges > cfg->nb_src_sges) {
+			*nr_sgsrc = (nr_buf / cfg->nb_dst_sges * cfg->nb_src_sges);
+			*nr_sgdst = nr_buf;
+		} else {
+			*nr_sgsrc = nr_buf;
+			*nr_sgdst = (nr_buf / cfg->nb_src_sges * cfg->nb_dst_sges);
+		}
+	}
+
+	return nr_buf;
+}
+
+static lcore_function_t *
+get_work_function(struct test_configure *cfg)
+{
+	lcore_function_t *fn;
+
+	if (cfg->is_dma) {
+		if (!cfg->is_sg)
+			fn = do_dma_plain_mem_copy;
+		else
+			fn = do_dma_sg_mem_copy;
+	} else {
+		fn = do_cpu_mem_copy;
+	}
+
+	return fn;
+}
+
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -469,12 +637,17 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	nr_buf = align_buffer_count(cfg, &nr_sgsrc, &nr_sgdst);
+	cfg->nr_buf = nr_buf;
+
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -494,7 +667,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -508,10 +681,15 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
-			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		if (cfg->is_sg) {
+			lcores[i]->sge.nb_srcs = cfg->nb_src_sges;
+			lcores[i]->sge.nb_dsts = cfg->nb_dst_sges;
+			lcores[i]->sge.srcs = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->sge.dsts = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]),
+				lcore_id);
 	}
 
 	while (1) {
@@ -543,7 +721,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM && !cfg->is_sg) {
 		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
 			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
 				   rte_pktmbuf_mtod(dsts[i], void *),
@@ -553,6 +731,44 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 				goto out;
 			}
 		}
+	} else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM && cfg->is_sg) {
+		size_t src_remsz = buf_size % cfg->nb_src_sges;
+		size_t dst_remsz = buf_size % cfg->nb_dst_sges;
+		size_t src_sz = buf_size / cfg->nb_src_sges;
+		size_t dst_sz = buf_size / cfg->nb_dst_sges;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->nb_src_sges, cfg->nb_dst_sges)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->nb_src_sges; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->nb_src_sges + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->nb_dst_sges; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->nb_dst_sges + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->nb_src_sges);
+				ret = -1;
+				goto out;
+			}
+		}
 	}
 
 	mops_total = 0;
@@ -562,10 +778,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -608,13 +822,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index ada0146b92..2a7e2567d3 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -47,6 +47,14 @@
 ;    "vfid" denotes VF-id of PF-id to be used for data transfer.
 ;    Example: vchan_dev=raddr=0x400000,coreid=1,pfid=2,vfid=3
 
+; Parameters for DMA scatter-gather memory copy:
+;
+; "dma_src_sge" denotes number of source segments.
+; "dma_dst_sge" denotes number of destination segments.
+;
+; For DMA scatter-gather memory copy, the parameters need to be configured
+; and they are valid only when type is DMA_MEM_COPY.
+
 ; To specify a configuration file, use the "--config" flag followed by the path to the file.
 
 ; To specify a result file, use the "--result" flag followed by the path to the file.
@@ -67,6 +75,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_src_sge=4
+dma_dst_sge=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=dev2mem
@@ -82,7 +105,7 @@ test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index df05bcd7df..18219918cc 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -365,7 +363,8 @@ load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_sges_str, *dst_sges_str;
 	const char *skip;
 	struct rte_kvargs *kvlist;
 	const char *vchan_dev;
@@ -467,6 +466,7 @@ load_configs(const char *path)
 			rte_kvargs_free(kvlist);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -501,6 +501,39 @@ load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_sges_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_src_sge");
+			if (src_sges_str != NULL) {
+				test_case->nb_src_sges = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_src_sge"));
+			}
+
+			dst_sges_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_dst_sge");
+			if (dst_sges_str != NULL) {
+				test_case->nb_dst_sges = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_dst_sge"));
+			}
+
+			if ((src_sges_str != NULL && dst_sges_str == NULL) ||
+			    (src_sges_str == NULL && dst_sges_str != NULL)) {
+				printf("parse dma_src_sge, dma_dst_sge error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_sges_str != NULL && dst_sges_str != NULL) {
+				test_case->is_sg = true;
+
+				if (test_case->nb_src_sges == 0 || test_case->nb_dst_sges == 0) {
+					printf("dma_src_sge and dma_dst_sge can not be 0 in case %d.\n",
+						i + 1);
+					test_case->is_valid = false;
+					continue;
+				}
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 1123e7524a..e88d72f54f 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -53,11 +53,14 @@ struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t nb_src_sges;
+	uint8_t nb_dst_sges;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -66,6 +69,6 @@ struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
  2024-02-29 13:16                   ` [EXT] " Gowrishankar Muthukrishnan
@ 2024-03-01  2:07                     ` fengchengwen
  2024-03-01  8:06                       ` [EXTERNAL] " Gowrishankar Muthukrishnan
  0 siblings, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-03-01  2:07 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, Amit Prakash Shukla, Cheng Jiang
  Cc: dev, Jerin Jacob, Anoob Joseph, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula
Hi Gowrishankar,
On 2024/2/29 21:16, Gowrishankar Muthukrishnan wrote:
> Hi Fengcheng,
> 
>> -----Original Message-----
>> From: fengchengwen <fengchengwen@huawei.com>
>> Sent: Wednesday, February 28, 2024 3:02 PM
>> To: Amit Prakash Shukla <amitprakashs@marvell.com>; Cheng Jiang
>> <honest.jiang@foxmail.com>
>> Cc: dev@dpdk.org; Jerin Jacob <jerinj@marvell.com>; Anoob Joseph
>> <anoobj@marvell.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
>> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
>> <pbhagavatula@marvell.com>; Gowrishankar Muthukrishnan
>> <gmuthukrishn@marvell.com>
>> Subject: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> Hi Gowrishankar,
>>
>> On 2024/2/28 2:56, Amit Prakash Shukla wrote:
>>> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
>>>
>>> Add SG copy support.
>>>
>>> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
>>> Acked-by: Anoob Joseph <anoobj@marvell.com>
>>> Acked-by: Chengwen Feng <fengchengwen@huawei.com>
>>> ---
>>> v10:
>>> - SG config variables renamed.
>>>
>>>  app/test-dma-perf/benchmark.c | 278
>>> +++++++++++++++++++++++++++++-----
>>>  app/test-dma-perf/config.ini  |  25 ++-
>>>  app/test-dma-perf/main.c      |  34 ++++-
>>>  app/test-dma-perf/main.h      |   5 +-
>>>  4 files changed, 300 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/app/test-dma-perf/benchmark.c
>>> b/app/test-dma-perf/benchmark.c index 0047e2f4b8..25ed6fa6d0 100644
>>> --- a/app/test-dma-perf/benchmark.c
>>> +++ b/app/test-dma-perf/benchmark.c
>>> @@ -46,6 +46,10 @@ struct lcore_params {
>>>  	uint16_t test_secs;
>>>  	struct rte_mbuf **srcs;
>>>  	struct rte_mbuf **dsts;
>>> +	struct rte_dma_sge *src_sges;
>>> +	struct rte_dma_sge *dst_sges;
>>> +	uint8_t src_ptrs;
>>> +	uint8_t dst_ptrs;
>>
>> 1. src/dst_ptrs -> src/dst_nb_sge
> Ack.
> 
>> 2. How about wrap these four fields as a struct?
> Ack.
> 
>>
>>>  	volatile struct worker_info worker_info;  };
>>>
>>> @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf,
>>> uint16_t nb_workers, uint16_t te  }
>>>
>>>  static void
>>> -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
>> uint16_t ring_size,
>>> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>> buf_size, uint32_t nr_buf,
>>> -			float memory, float bandwidth, float mops, bool
>> is_dma)
>>> +output_result(struct test_configure *cfg, struct lcore_params *para,
>>> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>> buf_size,
>>> +			uint32_t nr_buf, float memory, float bandwidth, float
>> mops)
>>>  {
>>> -	if (is_dma)
>>> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>> %u.\n",
>>> -				lcore_id, dma_name, ring_size, kick_batch);
>>> -	else
>>> +	uint16_t ring_size = cfg->ring_size.cur;
>>> +	uint8_t scenario_id = cfg->scenario_id;
>>> +	uint32_t lcore_id = para->lcore_id;
>>> +	char *dma_name = para->dma_name;
>>> +
>>> +	if (cfg->is_dma) {
>>> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>> %u", lcore_id,
>>> +		       dma_name, ring_size, kick_batch);
>>> +		if (cfg->is_sg)
>>> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
>>> +			       para->src_ptrs, para->dst_ptrs);
>>
>> DMA src sges: %u DMA dst sges: %u
>>
>> I think we should add a column which title maybe misc, some like sg-src[4]-
>> dst[1], and later we may add fill test, then this field could be pattern-
>> 0x12345678
>>
>> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit, if
>> the DMA was worked in non-mem2mem direction, we could add simple
>> descriptor of direction and pcie.info in the above misc column.
>>
> 
> I am sorry, I could not understand complete picture here. Do you mean we 
> reserve a column and use it as per test type.
> 
> For plain mem copy, nothing added.
> For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print "sg-src[1]-dst[4]".
> In future, when we add fill test in benchmark, this line instead be "pattern-0x12345678".
> 
> Is my understanding correct over here ?
Yes, something like this.
> 
>>> +		printf(".\n");
>>> +	} else {
>>>  		printf("lcore %u\n", lcore_id);
>>> +	}
>>>
>>>  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer
>> Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
>>>  			ave_cycle, buf_size, nr_buf, memory,
>> rte_get_timer_hz()/1000000000.0);
>>>  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth,
>>> mops);
>>>
>>> -	if (is_dma)
>>> +	if (cfg->is_dma)
>>>  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
>> CSV_LINE_DMA_FMT,
>>>  			scenario_id, lcore_id, dma_name, ring_size,
>> kick_batch, buf_size,
>>>  			nr_buf, memory, ave_cycle, bandwidth, mops); @@ -
>> 167,7 +181,7 @@
>>> vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
>>>
>>>  /* Configuration of device. */
>>>  static void
>>> -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
>>> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg,
>>> +uint8_t ptrs_max)
>>>  {
>>>  	uint16_t vchan = 0;
>>>  	struct rte_dma_info info;
>>> @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct
>> test_configure *cfg)
>>>  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported
>> on device id. %u\n",
>>>  				dev_id);
>>>
>>> +	if (info.max_sges < ptrs_max)
>>> +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported
>> by
>>> +device id %u.\n",
>>
>> "Error with unsupport max_sges on device id %u.\n"
> Ack.
> 
>>
>>> +				dev_id);
>>> +
>>>  	if (rte_dma_start(dev_id) != 0)
>>>  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");  } @@ -
>> 202,8
>>> +220,12 @@ config_dmadevs(struct test_configure *cfg)
>>>  	uint32_t i;
>>>  	int dev_id;
>>>  	uint16_t nb_dmadevs = 0;
>>> +	uint8_t ptrs_max = 0;
>>
>> It is hard to understand; how about nb_sge?
> 
> Ack. Renamed it to nb_sges.
>>
>>>  	char *dma_name;
>>>
>>> +	if (cfg->is_sg)
>>> +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
>>> +
>>>  	for (i = 0; i < ldm->cnt; i++) {
>>>  		dma_name = ldm->dma_names[i];
>>>  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
>>> @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
>>>  		}
>>>
>>>  		ldm->dma_ids[i] = dev_id;
>>> -		configure_dmadev_queue(dev_id, cfg);
>>> +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
>>>  		++nb_dmadevs;
>>>  	}
>>>
>>> @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id,
>> uint64_t *async_cnt,
>>>  }
>>>
>>>  static inline int
>>> -do_dma_mem_copy(void *p)
>>> +do_dma_plain_mem_copy(void *p)
>>>  {
>>>  	struct lcore_params *para = (struct lcore_params *)p;
>>>  	volatile struct worker_info *worker_info = &(para->worker_info);
>>> @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
>>>  	return 0;
>>>  }
>>>
>>> +static inline int
>>> +do_dma_sg_mem_copy(void *p)
>>> +{
>>> +	struct lcore_params *para = (struct lcore_params *)p;
>>> +	volatile struct worker_info *worker_info = &(para->worker_info);
>>> +	struct rte_dma_sge *src_sges = para->src_sges;
>>> +	struct rte_dma_sge *dst_sges = para->dst_sges;
>>> +	const uint16_t kick_batch = para->kick_batch;
>>> +	const uint8_t src_ptrs = para->src_ptrs;
>>> +	const uint8_t dst_ptrs = para->dst_ptrs;
>>> +	const uint16_t dev_id = para->dev_id;
>>> +	uint32_t nr_buf = para->nr_buf;
>>> +	uint64_t async_cnt = 0;
>>> +	uint32_t poll_cnt = 0;
>>> +	uint16_t nr_cpl;
>>> +	uint32_t i, j;
>>> +	int ret;
>>> +
>>> +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
>>> +	worker_info->stop_flag = false;
>>> +	worker_info->ready_flag = true;
>>> +
>>> +	while (!worker_info->start_flag)
>>> +		;
>>> +
>>> +	while (1) {
>>> +		j = 0;
>>> +		for (i = 0; i < nr_buf; i++) {
>>> +dma_copy:
>>> +			ret = rte_dma_copy_sg(dev_id, 0,
>>> +				&src_sges[i * src_ptrs], &dst_sges[j *
>> dst_ptrs],
>>> +				src_ptrs, dst_ptrs, 0);
>>> +			if (unlikely(ret < 0)) {
>>> +				if (ret == -ENOSPC) {
>>> +					do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>>> +					goto dma_copy;
>>> +				} else
>>> +					error_exit(dev_id);
>>> +			}
>>> +			async_cnt++;
>>> +			j++;
>>> +
>>> +			if ((async_cnt % kick_batch) == 0)
>>> +				do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>>> +		}
>>> +
>>> +		if (worker_info->stop_flag)
>>> +			break;
>>> +	}
>>> +
>>> +	rte_dma_submit(dev_id, 0);
>>> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
>>> +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
>> NULL, NULL);
>>> +		async_cnt -= nr_cpl;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  static inline int
>>>  do_cpu_mem_copy(void *p)
>>>  {
>>> @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
>>>  }
>>>
>>>  static int
>>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>> -			struct rte_mbuf ***dsts)
>>> +setup_memory_env(struct test_configure *cfg,
>>> +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
>>> +			 struct rte_dma_sge **src_sges, struct rte_dma_sge
>> **dst_sges)
>>>  {
>>>  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
>>>  	unsigned int cur_buf_size = cfg->buf_size.cur;
>>> @@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg,
>> struct rte_mbuf ***srcs,
>>>  	}
>>>
>>>  	for (i = 0; i < nr_buf; i++) {
>>> -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
>> buf_size);
>>> -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
>>> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
>> cur_buf_size);
>>> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0,
>> cur_buf_size);
>>>  	}
>>>
>>>  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
>>> @@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg,
>> struct rte_mbuf ***srcs,
>>>  		}
>>>  	}
>>>
>>> +	if (cfg->is_sg) {
>>> +		uint8_t src_ptrs = cfg->src_ptrs;
>>> +		uint8_t dst_ptrs = cfg->dst_ptrs;
>>> +		uint32_t sglen_src, sglen_dst;
>>> +
>>> +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
>> rte_dma_sge),
>>> +					RTE_CACHE_LINE_SIZE);
>>> +		if (*src_sges == NULL) {
>>> +			printf("Error: src_sges array malloc failed.\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
>> rte_dma_sge),
>>> +					RTE_CACHE_LINE_SIZE);
>>> +		if (*dst_sges == NULL) {
>>> +			printf("Error: dst_sges array malloc failed.\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		sglen_src = cur_buf_size / src_ptrs;
>>> +		sglen_dst = cur_buf_size / dst_ptrs;
>>> +
>>> +		for (i = 0; i < nr_buf; i++) {
>>> +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
>>> +			(*src_sges)[i].length = sglen_src;
>>> +			if (!((i+1) % src_ptrs))
>>> +				(*src_sges)[i].length += (cur_buf_size %
>> src_ptrs);
>>> +
>>> +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
>>> +			(*dst_sges)[i].length = sglen_dst;
>>> +			if (!((i+1) % dst_ptrs))
>>> +				(*dst_sges)[i].length += (cur_buf_size %
>> dst_ptrs);
>>> +		}
>>> +	}
>>> +
>>>  	return 0;
>>>  }
>>>
>>>  int
>>> -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>> +mem_copy_benchmark(struct test_configure *cfg)
>>>  {
>>> -	uint32_t i;
>>> +	uint32_t i, j;
>>>  	uint32_t offset;
>>>  	unsigned int lcore_id = 0;
>>>  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
>>> +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
>>>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>> +	const uint32_t mcore_id = rte_get_main_lcore();
>>>  	unsigned int buf_size = cfg->buf_size.cur;
>>>  	uint16_t kick_batch = cfg->kick_batch.cur;
>>> -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
>> (cfg->buf_size.cur * 2);
>>>  	uint16_t nb_workers = ldm->cnt;
>>>  	uint16_t test_secs = cfg->test_secs;
>>>  	float memory = 0;
>>> @@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>  	uint32_t avg_cycles_total;
>>>  	float mops, mops_total;
>>>  	float bandwidth, bandwidth_total;
>>> +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
>>> +	uint32_t nr_buf;
>>>  	int ret = 0;
>>>
>>> -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
>>> +	/* Align number of buffers according to workers count */
>>> +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
>>> +	nr_buf -= (nr_buf % nb_workers);
>>> +	if (cfg->is_sg) {
>>> +		nr_buf /= nb_workers;
>>> +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
>>> +		nr_buf *= nb_workers;
>>> +
>>> +		if (cfg->dst_ptrs > cfg->src_ptrs) {
>>> +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
>>> +			nr_sgdst = nr_buf;
>>> +		} else {
>>> +			nr_sgsrc = nr_buf;
>>> +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
>>> +		}
>>> +	}
>>
>> pls move to a subfunction
> Ack.
> 
>>
>>> +
>>> +	cfg->nr_buf = nr_buf;
>>> +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
>>>  		goto out;
>>>
>>> -	if (is_dma)
>>> +	if (cfg->is_dma)
>>>  		if (config_dmadevs(cfg) < 0)
>>>  			goto out;
>>>
>>> @@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>
>>>  	for (i = 0; i < nb_workers; i++) {
>>>  		lcore_id = ldm->lcores[i];
>>> +		if (lcore_id == mcore_id) {
>>> +			printf("lcore parameters can not use main core id
>> %d\n", mcore_id);
>>> +			goto out;
>>> +		}
>>> +
>>> +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
>>> +			printf("lcore parameters can not use offline core id
>> %d\n", lcore_id);
>>> +			goto out;
>>> +		}
>>
>> The above two checks should be in a separate commit.
> 
> Sorry, somehow it got mixed from different patch I had in my local repo.
> It will be in different commit.
> 
>>
>>> +
>>>  		offset = nr_buf / nb_workers * i;
>>>  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
>>>  		if (lcores[i] == NULL) {
>>>  			printf("lcore parameters malloc failure for lcore %d\n",
>> lcore_id);
>>>  			break;
>>>  		}
>>> -		if (is_dma) {
>>> +		if (cfg->is_dma) {
>>>  			lcores[i]->dma_name = ldm->dma_names[i];
>>>  			lcores[i]->dev_id = ldm->dma_ids[i];
>>>  			lcores[i]->kick_batch = kick_batch;
>>> @@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>  		lcores[i]->scenario_id = cfg->scenario_id;
>>>  		lcores[i]->lcore_id = lcore_id;
>>>
>>> -		if (is_dma)
>>> -			rte_eal_remote_launch(do_dma_mem_copy, (void
>> *)(lcores[i]), lcore_id);
>>> -		else
>>> +		if (cfg->is_sg) {
>>> +			lcores[i]->src_ptrs = cfg->src_ptrs;
>>> +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
>>> +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers
>> * i);
>>> +			lcores[i]->dst_sges = dst_sges + (nr_sgdst /
>> nb_workers * i);
>>> +		}
>>> +
>>> +		if (cfg->is_dma) {
>>> +			if (!cfg->is_sg)
>>> +
>> 	rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
>>> +					lcore_id);
>>> +			else
>>> +
>> 	rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
>>> +					lcore_id);
>>> +		} else {
>>>  			rte_eal_remote_launch(do_cpu_mem_copy, (void
>> *)(lcores[i]), lcore_id);
>>> +		}
>>
>> Too many conditions for selecting the target function; how about wrapping it in a
>> subfunction:
>> lcore_function_t get_work_function(struct test_configure *cfg)
>> then rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]),
>> lcore_id);
>>
> Ack.
> 
>>>  	}
>>>
>>>  	while (1) {
>>> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>
>>>  	rte_eal_mp_wait_lcore();
>>>
>>> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>> -			   rte_pktmbuf_mtod(dsts[i], void *),
>>> -			   cfg->buf_size.cur) != 0) {
>>> -			printf("Copy validation fails for buffer number %d\n",
>> i);
>>> -			ret = -1;
>>> -			goto out;
>>> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
>> {
>>> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>> +					rte_pktmbuf_mtod(dsts[i], void *),
>>> +					cfg->buf_size.cur) != 0) {
>>> +				printf("Copy validation fails for buffer number
>> %d\n", i);
>>> +				ret = -1;
>>> +				goto out;
>>> +			}
>>> +		}
>>> +	} else if (cfg->is_sg && cfg->transfer_dir ==
>> RTE_DMA_DIR_MEM_TO_MEM) {
>>> +		size_t src_remsz = buf_size % cfg->src_ptrs;
>>> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
>>> +		size_t src_sz = buf_size / cfg->src_ptrs;
>>> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
>>> +		uint8_t src[buf_size], dst[buf_size];
>>> +		uint8_t *sbuf, *dbuf, *ptr;
>>> +
>>> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
>> i++) {
>>> +			sbuf = src;
>>> +			dbuf = dst;
>>> +			ptr = NULL;
>>> +
>>> +			for (j = 0; j < cfg->src_ptrs; j++) {
>>> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
>> + j], uint8_t *);
>>> +				memcpy(sbuf, ptr, src_sz);
>>> +				sbuf += src_sz;
>>> +			}
>>> +
>>> +			if (src_remsz)
>>> +				memcpy(sbuf, ptr + src_sz, src_remsz);
>>> +
>>> +			for (j = 0; j < cfg->dst_ptrs; j++) {
>>> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
>> + j], uint8_t *);
>>> +				memcpy(dbuf, ptr, dst_sz);
>>> +				dbuf += dst_sz;
>>> +			}
>>> +
>>> +			if (dst_remsz)
>>> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
>>> +
>>> +			if (memcmp(src, dst, buf_size) != 0) {
>>> +				printf("SG Copy validation fails for buffer
>> number %d\n",
>>> +					i * cfg->src_ptrs);
>>> +				ret = -1;
>>> +				goto out;
>>> +			}
>>
>> Now I doubt the value of this verification: it can't detect a copy failure in a
>> middle round,
>> because as long as the last round's copy is successful, the validation will pass.
>>
> Validation is on entire buffer. If any middle copy is a failure, entire memcmp
> would have failed. Or do I miss something ?
> 
>> And adding validatation in every round copy will impact performance.
>>
> This validation is just after worker function is stopped measuring perf.
> How would this impact performance ?
Yes, it won't impact performance.
What I meant before is that the validation is not effective; please consider the following scenario:
	while (1) {
		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs; let's define this as one round of copying.
dma_copy:
			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
			if (unlikely(ret < 0)) {
				if (ret == -ENOSPC) {
					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
					goto dma_copy;
				} else
					error_exit(dev_id);
			}
			async_cnt++;
			if ((async_cnt % kick_batch) == 0)
				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
		}
		if (worker_info->stop_flag)   // if don't stop, it will do many round copies.
			break;
	}
and the later validation verifies only the last round. Let's assume there are 100 rounds: if the last round's copy
works well but the copies in rounds 0~98 all fail, then the validation will not detect it.
So if we want to validate everything, we should add the validation after every round of copying, but that will
impact the performance.
> 
>> Also, app/test_dmadev already verifies data, so I think we should drop the
>> validation commit.
> 
> Even in some corner cases or unknown issues, copy would have failed
> and taking perf cycles then is meaningless. That is the reason, this validation
> is added after perf function doing its job.
How about:
	while (1) {
		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs; let's define this as one round of copying.
dma_copy:
			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
			if (unlikely(ret < 0)) {
				if (ret == -ENOSPC) {
					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
					goto dma_copy;
				} else
					error_exit(dev_id);
			}
			async_cnt++;
			if ((async_cnt % kick_batch) == 0)
				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
		}
		if (unlikely(work_info->verify)) {
			ret = verify();
			if (ret != 0) {
				// error trace,
				break;
			}
		}
		if (worker_info->stop_flag)   // if don't stop, it will do many round copies.
			break;
	}
and make this verification a config entry.
> 
>>
>>>  		}
>>>  	}
>>>
>>> @@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg,
>> bool is_dma)
>>>  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
>>>  			lcores[i]->worker_info.test_cpl,
>>>  			&memory, &avg_cycles, &bandwidth, &mops);
>>> -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
>>> -					lcores[i]->dma_name, cfg-
>>> ring_size.cur, kick_batch,
>>> -					avg_cycles, buf_size, nr_buf /
>> nb_workers, memory,
>>> -					bandwidth, mops, is_dma);
>>> +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
>>> +			nr_buf / nb_workers, memory, bandwidth, mops);
>>>  		mops_total += mops;
>>>  		bandwidth_total += bandwidth;
>>>  		avg_cycles_total += avg_cycles;
>>> @@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>  	rte_mempool_free(dst_pool);
>>>  	dst_pool = NULL;
>>>
>>> +	/* free sges for mbufs */
>>> +	rte_free(src_sges);
>>> +	src_sges = NULL;
>>> +
>>> +	rte_free(dst_sges);
>>> +	dst_sges = NULL;
>>> +
>>>  	/* free the worker parameters */
>>>  	for (i = 0; i < nb_workers; i++) {
>>>  		rte_free(lcores[i]);
>>>  		lcores[i] = NULL;
>>>  	}
>>>
>>> -	if (is_dma) {
>>> +	if (cfg->is_dma) {
>>>  		for (i = 0; i < nb_workers; i++) {
>>>  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
>>>  			rte_dma_stop(ldm->dma_ids[i]);
>>> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
>>> index 9c8221025e..28f6c9d1db 100644
>>> --- a/app/test-dma-perf/config.ini
>>> +++ b/app/test-dma-perf/config.ini
>>> @@ -38,6 +38,14 @@
>>>
>>>  ; "skip" To skip a test-case set skip to 1.
>>
>> Please place the descriptions of the entries newly added by this patchset above the
>> "; To specify a configuration file, use the "--config" flag followed by the path to
>> the file."
>>
>> because in the original config.ini the parameter descriptions come first, then the
>> program argument descriptions, and the examples come last.
>>
> Ack.
>>>
>>> +; Parameters to be configured for SG copy:
>>
>> Parameters for DMA scatter-gather memory copy:
>>
> Ack.
>>> +; ========================================
>>
>> Please remove this line
>>
> Ack.
>>> +; "dma_src_sge" denotes number of source segments.
>>> +; "dma_dst_sge" denotes number of destination segments.
>>> +;
>>> +; For SG copy, both the parameters need to be configured and they are valid
>> only
>>> +; when type is DMA_MEM_COPY.
>>
>> For DMA scatter-gather memory copy, the parameters need to be configured
>> and they are valid only
>> when type is DMA_MEM_COPY.
>>
> Ack.
>>> +;
>>>  ; Parameters to be configured for data transfers from "mem to dev" and
>> "dev to mem":
>>>  ;
>> ===================================================================
>> ===============
>>
>> Please remove this line
>>
>> As another commit "Re: [PATCH v2] app/dma-perf: support bi-directional
>> transfer"'s review feedback,
>> these descriptor should place after
>> "
>> ; To use DMA for a test, please specify the "lcore_dma" parameter.
>> ; If you have already set the "-l" and "-a" parameters using EAL,
>> ; make sure that the value of "lcore_dma" falls within their range of the values.
>> ; We have to ensure a 1:1 mapping between the core and DMA device.
>> "
>>
>>
>>>  ; "direction" denotes the direction of data transfer. It can take 3 values:
>>> @@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2,
>> lcore11@0000:00:04.3
>>>  eal_args=--in-memory --file-prefix=test
>>>
>>>  [case2]
>>> +type=DMA_MEM_COPY
>>> +mem_size=10
>>> +buf_size=64,8192,2,MUL
>>> +dma_ring_size=1024
>>> +dma_src_sge=4
>>> +dma_dst_sge=1
>>> +kick_batch=32
>>> +src_numa_node=0
>>> +dst_numa_node=0
>>> +cache_flush=0
>>> +test_seconds=2
>>> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>>> +eal_args=--in-memory --file-prefix=test
>>> +
>>> +[case3]
>>>  skip=1
>>>  type=DMA_MEM_COPY
>>>  direction=dev2mem
>>> @@ -84,7 +107,7 @@ test_seconds=2
>>>  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>>>  eal_args=--in-memory --file-prefix=test
>>>
>>> -[case3]
>>> +[case4]
>>>  type=CPU_MEM_COPY
>>>  mem_size=10
>>>  buf_size=64,8192,2,MUL
>>> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
>>> index df05bcd7df..a27e4c9429 100644
>>> --- a/app/test-dma-perf/main.c
>>> +++ b/app/test-dma-perf/main.c
>>> @@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
>>>
>>>  	switch (case_cfg->test_type) {
>>>  	case TEST_TYPE_DMA_MEM_COPY:
>>> -		ret = mem_copy_benchmark(case_cfg, true);
>>> -		break;
>>>  	case TEST_TYPE_CPU_MEM_COPY:
>>> -		ret = mem_copy_benchmark(case_cfg, false);
>>> +		ret = mem_copy_benchmark(case_cfg);
>>>  		break;
>>>  	default:
>>>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
>>> @@ -365,7 +363,8 @@ load_configs(const char *path)
>>>  	const char *case_type;
>>>  	const char *transfer_dir;
>>>  	const char *lcore_dma;
>>> -	const char *mem_size_str, *buf_size_str, *ring_size_str,
>> *kick_batch_str;
>>> +	const char *mem_size_str, *buf_size_str, *ring_size_str,
>> *kick_batch_str,
>>> +		*src_ptrs_str, *dst_ptrs_str;
>>>  	const char *skip;
>>>  	struct rte_kvargs *kvlist;
>>>  	const char *vchan_dev;
>>> @@ -467,6 +466,7 @@ load_configs(const char *path)
>>>  			rte_kvargs_free(kvlist);
>>>  		}
>>>
>>> +		test_case->is_dma = is_dma;
>>>  		test_case->src_numa_node =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>>
>> 	section_name, "src_numa_node"));
>>>  		test_case->dst_numa_node =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>> @@ -501,6 +501,32 @@ load_configs(const char *path)
>>>  			} else if (args_nr == 4)
>>>  				nb_vp++;
>>>
>>> +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile,
>> section_name,
>>> +
>> 	"dma_src_sge");
>>> +			if (src_ptrs_str != NULL) {
>>> +				test_case->src_ptrs =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>> +
>> 	section_name, "dma_src_sge"));
>>> +			}
>>> +
>>> +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile,
>> section_name,
>>> +
>> 	"dma_dst_sge");
>>> +			if (dst_ptrs_str != NULL) {
>>> +				test_case->dst_ptrs =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>> +
>> 	section_name, "dma_dst_sge"));
>>> +			}
>>> +
>>> +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
>>> +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
>>
>> Please also check test_case->src_ptrs and test_case->dst_ptrs valid, make sure
>> there are >1 and <=UINT16_MAX
> 
> At present, this is uint8_t. Do we need it more than UINT8_MAX ?
ok
> 
>>
>>> +				printf("parse dma_src_sge, dma_dst_sge error
>> in case %d.\n",
>>> +					i + 1);
>>> +				test_case->is_valid = false;
>>> +				continue;
>>> +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL)
>> {
>>> +				test_case->is_sg = true;
>>> +			} else {
>>> +				test_case->is_sg = false;
>>
>> the above could simple by: test_case->is_sg = (src_ptrs_str != NULL &&
>> dst_ptrs_str != NULL);
>>
> Added check for nb_ validation here. Please check in next version of patch.
>>> +			}
>>> +
>>>  			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
>> section_name, "kick_batch");
>>>  			args_nr = parse_entry(kick_batch_str, &test_case-
>>> kick_batch);
>>>  			if (args_nr < 0) {
>>> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
>>> index 1123e7524a..baf149b72b 100644
>>> --- a/app/test-dma-perf/main.h
>>> +++ b/app/test-dma-perf/main.h
>>> @@ -53,11 +53,14 @@ struct test_configure {
>>>  	uint16_t dst_numa_node;
>>>  	uint16_t opcode;
>>>  	bool is_dma;
>>> +	bool is_sg;
>>>  	struct lcore_dma_map_t lcore_dma_map;
>>>  	struct test_configure_entry mem_size;
>>>  	struct test_configure_entry buf_size;
>>>  	struct test_configure_entry ring_size;
>>>  	struct test_configure_entry kick_batch;
>>> +	uint8_t src_ptrs;
>>> +	uint8_t dst_ptrs;
>>>  	uint8_t cache_flush;
>>>  	uint32_t nr_buf;
>>>  	uint16_t test_secs;
>>> @@ -66,6 +69,6 @@ struct test_configure {
>>>  	struct test_vchan_dev_config vchan_dev;
>>>  };
>>>
>>> -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
>>> +int mem_copy_benchmark(struct test_configure *cfg);
>>>
>>>  #endif /* MAIN_H */
>>>
> 
> Thank you for your review. Please confirm if there are any other changes
> and I hope next version goes through 😊
> 
> Regards,
> Gowrishankar
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
  2024-03-01  2:07                     ` fengchengwen
@ 2024-03-01  8:06                       ` Gowrishankar Muthukrishnan
  2024-03-01  9:45                         ` fengchengwen
  0 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-01  8:06 UTC (permalink / raw)
  To: fengchengwen, Amit Prakash Shukla, Cheng Jiang
  Cc: dev, Jerin Jacob, Anoob Joseph, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula
Hi Fengcheng,
<cut>
> >>> -output_result(uint8_t scenario_id, uint32_t lcore_id, char
> >>> *dma_name,
> >> uint16_t ring_size,
> >>> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> >> buf_size, uint32_t nr_buf,
> >>> -			float memory, float bandwidth, float mops, bool
> >> is_dma)
> >>> +output_result(struct test_configure *cfg, struct lcore_params *para,
> >>> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> >> buf_size,
> >>> +			uint32_t nr_buf, float memory, float bandwidth, float
> >> mops)
> >>>  {
> >>> -	if (is_dma)
> >>> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> >> %u.\n",
> >>> -				lcore_id, dma_name, ring_size, kick_batch);
> >>> -	else
> >>> +	uint16_t ring_size = cfg->ring_size.cur;
> >>> +	uint8_t scenario_id = cfg->scenario_id;
> >>> +	uint32_t lcore_id = para->lcore_id;
> >>> +	char *dma_name = para->dma_name;
> >>> +
> >>> +	if (cfg->is_dma) {
> >>> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> >> %u", lcore_id,
> >>> +		       dma_name, ring_size, kick_batch);
> >>> +		if (cfg->is_sg)
> >>> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> >>> +			       para->src_ptrs, para->dst_ptrs);
> >>
> >> DMA src sges: %u DMA dst sges: %u
> >>
> >> I think we should add a column which title maybe misc, some like
> >> sg-src[4]- dst[1], and later we may add fill test, then this field
> >> could be pattern-
> >> 0x12345678
> >>
> >> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit,
> >> if the DMA was worked in non-mem2mem direction, we could add simple
> >> descriptor of direction and pcie.info in the above misc column.
> >>
> >
> > I am sorry, I could not understand complete picture here. Do you mean
> > we reserve a column and use it as per test type.
> >
> > For plain mem copy, nothing added.
> > For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print
> "sg-src[1]-dst[4]".
> > In future, when we add fill test in benchmark, this line instead be "pattern-
> 0x12345678".
> >
> > Is my understanding correct over here ?
> 
> Yes, some like this.
> 
This patch adds the SGE info in alignment with the existing output.
I think it is better to add further extensions as we add new features. Since the app doesn't support the features that you mentioned, it is difficult to anticipate the requirements.
In fact, if the additional frameworks that we put in are not useful for those features, it could lead to stale code.
I would prefer if we can make these changes as we add new features.
> >
<cut>
> >>>  	}
> >>>
> >>>  	while (1) {
> >>> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
> >> *cfg, bool is_dma)
> >>>
> >>>  	rte_eal_mp_wait_lcore();
> >>>
> >>> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> >>> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> >>> -			   rte_pktmbuf_mtod(dsts[i], void *),
> >>> -			   cfg->buf_size.cur) != 0) {
> >>> -			printf("Copy validation fails for buffer number %d\n",
> >> i);
> >>> -			ret = -1;
> >>> -			goto out;
> >>> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
> >> {
> >>> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> >>> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> >>> +					rte_pktmbuf_mtod(dsts[i], void *),
> >>> +					cfg->buf_size.cur) != 0) {
> >>> +				printf("Copy validation fails for buffer number
> >> %d\n", i);
> >>> +				ret = -1;
> >>> +				goto out;
> >>> +			}
> >>> +		}
> >>> +	} else if (cfg->is_sg && cfg->transfer_dir ==
> >> RTE_DMA_DIR_MEM_TO_MEM) {
> >>> +		size_t src_remsz = buf_size % cfg->src_ptrs;
> >>> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> >>> +		size_t src_sz = buf_size / cfg->src_ptrs;
> >>> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> >>> +		uint8_t src[buf_size], dst[buf_size];
> >>> +		uint8_t *sbuf, *dbuf, *ptr;
> >>> +
> >>> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
> >> i++) {
> >>> +			sbuf = src;
> >>> +			dbuf = dst;
> >>> +			ptr = NULL;
> >>> +
> >>> +			for (j = 0; j < cfg->src_ptrs; j++) {
> >>> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
> >> + j], uint8_t *);
> >>> +				memcpy(sbuf, ptr, src_sz);
> >>> +				sbuf += src_sz;
> >>> +			}
> >>> +
> >>> +			if (src_remsz)
> >>> +				memcpy(sbuf, ptr + src_sz, src_remsz);
> >>> +
> >>> +			for (j = 0; j < cfg->dst_ptrs; j++) {
> >>> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
> >> + j], uint8_t *);
> >>> +				memcpy(dbuf, ptr, dst_sz);
> >>> +				dbuf += dst_sz;
> >>> +			}
> >>> +
> >>> +			if (dst_remsz)
> >>> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> >>> +
> >>> +			if (memcmp(src, dst, buf_size) != 0) {
> >>> +				printf("SG Copy validation fails for buffer
> >> number %d\n",
> >>> +					i * cfg->src_ptrs);
> >>> +				ret = -1;
> >>> +				goto out;
> >>> +			}
> >>
> >> Now I doubt the value of verify, this verify can't find the middle
> >> round copy failure, because as long as the last round copy is
> >> successful, the validation will pass.
> >>
> > Validation is on entire buffer. If any middle copy is a failure,
> > entire memcmp would have failed. Or do I miss something ?
> >
> >> And adding validatation in every round copy will impact performance.
> >>
> > This validation is just after worker function is stopped measuring perf.
> > How would this impact performance ?
> 
> Yes, it will don't impact performance.
> 
> What I said before is that is not valid, pls consider following scene:
> 
> 
> 	while (1) {
> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
> let's defind this is a round copy.
> dma_copy:
> 			ret = rte_dma_copy(dev_id, 0,
> rte_mbuf_data_iova(srcs[i]),
> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
> 			if (unlikely(ret < 0)) {
> 				if (ret == -ENOSPC) {
> 					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 					goto dma_copy;
> 				} else
> 					error_exit(dev_id);
> 			}
> 			async_cnt++;
> 
> 			if ((async_cnt % kick_batch) == 0)
> 				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 		}
> 
> 		if (worker_info->stop_flag)   // if don't stop, it will do many
> round copies.
> 			break;
> 	}
> 
> and the later validation just verify the last round, let's assume there are 100
> round, and if the last round copy work well, but round 0~98 both copy fail,
> then the validation will not detect it.
> 
> 
> So if we want do all the validation, then we should add the velidation after
> every round copy, but it will impact the performance.
> 
> 
> >
> >> Also app/test_dmadev already verify data. so I think we should drop
> >> the validation commit.
> >
> > Even in some corner cases or unknown issues, copy would have failed
> > and taking perf cycles then is meaningless. That is the reason, this
> > validation is added after perf function doing its job.
> 
> How about:
> 
> 	while (1) {
> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
> let's defind this is a round copy.
> dma_copy:
> 			ret = rte_dma_copy(dev_id, 0,
> rte_mbuf_data_iova(srcs[i]),
> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
> 			if (unlikely(ret < 0)) {
> 				if (ret == -ENOSPC) {
> 					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 					goto dma_copy;
> 				} else
> 					error_exit(dev_id);
> 			}
> 			async_cnt++;
> 
> 			if ((async_cnt % kick_batch) == 0)
> 				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 		}
> 
> 		if (unlikely(work_info->verify)) {
> 			ret = verify();
> 			if (ret != 0) {
> 				// error trace,
> 				break;
> 			}
> 		}
> 
> 		if (worker_info->stop_flag)   // if don't stop, it will do many
> round copies.
> 			break;
> 	}
> 
> and make this verify as a config entry
I believe there is a difference in understanding of what this is intended to do. The intention here is not to validate every operation done by DMA; that is already taken care of by the UT.
Is it possible that we are misreporting numbers if the application is buggy or the PMD is misbehaving for the scenario under test and the copies are not actually performed? Yes. Think about a scenario where the PMD is buggy when trying bursts of more than 1.
Checking the last set of buffers is more like testing a sample from the perf test to make sure the perf test was indeed doing what it claims to do. If you think it is unnecessary to do so, we can drop this from upstream. But adding complete verification to a performance app would be repeating what a unit test is expected to do. I would suggest not doing that.
Thanks,
Gowrishankar
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [EXTERNAL] Re: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
  2024-03-01  8:06                       ` [EXTERNAL] " Gowrishankar Muthukrishnan
@ 2024-03-01  9:45                         ` fengchengwen
  0 siblings, 0 replies; 79+ messages in thread
From: fengchengwen @ 2024-03-01  9:45 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, Amit Prakash Shukla, Cheng Jiang
  Cc: dev, Jerin Jacob, Anoob Joseph, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula
On 2024/3/1 16:06, Gowrishankar Muthukrishnan wrote:
> Hi Fengcheng,
> 
> <cut>
>>>>> -output_result(uint8_t scenario_id, uint32_t lcore_id, char
>>>>> *dma_name,
>>>> uint16_t ring_size,
>>>>> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>>>> buf_size, uint32_t nr_buf,
>>>>> -			float memory, float bandwidth, float mops, bool
>>>> is_dma)
>>>>> +output_result(struct test_configure *cfg, struct lcore_params *para,
>>>>> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>>>> buf_size,
>>>>> +			uint32_t nr_buf, float memory, float bandwidth, float
>>>> mops)
>>>>>  {
>>>>> -	if (is_dma)
>>>>> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>>>> %u.\n",
>>>>> -				lcore_id, dma_name, ring_size, kick_batch);
>>>>> -	else
>>>>> +	uint16_t ring_size = cfg->ring_size.cur;
>>>>> +	uint8_t scenario_id = cfg->scenario_id;
>>>>> +	uint32_t lcore_id = para->lcore_id;
>>>>> +	char *dma_name = para->dma_name;
>>>>> +
>>>>> +	if (cfg->is_dma) {
>>>>> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>>>> %u", lcore_id,
>>>>> +		       dma_name, ring_size, kick_batch);
>>>>> +		if (cfg->is_sg)
>>>>> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
>>>>> +			       para->src_ptrs, para->dst_ptrs);
>>>>
>>>> DMA src sges: %u DMA dst sges: %u
>>>>
>>>> I think we should add a column which title maybe misc, some like
>>>> sg-src[4]- dst[1], and later we may add fill test, then this field
>>>> could be pattern-
>>>> 0x12345678
>>>>
>>>> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit,
>>>> if the DMA was worked in non-mem2mem direction, we could add simple
>>>> descriptor of direction and pcie.info in the above misc column.
>>>>
>>>
>>> I am sorry, I could not understand complete picture here. Do you mean
>>> we reserve a column and use it as per test type.
>>>
>>> For plain mem copy, nothing added.
>>> For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print
>> "sg-src[1]-dst[4]".
>>> In future, when we add fill test in benchmark, this line instead be "pattern-
>> 0x12345678".
>>>
>>> Is my understanding correct over here ?
>>
>> Yes, some like this.
>>
> This patch adds SGE info in an alignment with existing output.
> 
> I think it is better to add further extensions as we add new features. Since the app doesn't support the features that you mentioned, it is difficult to anticipate the requirements.
> In fact, if the additional frameworks that we put in are not useful for those features, it could lead to stale code.
> I would prefer if we can make these changes as we add new features.
> 
>>>
> <cut>
>>>>>  	}
>>>>>
>>>>>  	while (1) {
>>>>> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
>>>> *cfg, bool is_dma)
>>>>>
>>>>>  	rte_eal_mp_wait_lcore();
>>>>>
>>>>> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>>>> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>>>> -			   rte_pktmbuf_mtod(dsts[i], void *),
>>>>> -			   cfg->buf_size.cur) != 0) {
>>>>> -			printf("Copy validation fails for buffer number %d\n",
>>>> i);
>>>>> -			ret = -1;
>>>>> -			goto out;
>>>>> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
>>>> {
>>>>> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>>>> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>>>> +					rte_pktmbuf_mtod(dsts[i], void *),
>>>>> +					cfg->buf_size.cur) != 0) {
>>>>> +				printf("Copy validation fails for buffer number
>>>> %d\n", i);
>>>>> +				ret = -1;
>>>>> +				goto out;
>>>>> +			}
>>>>> +		}
>>>>> +	} else if (cfg->is_sg && cfg->transfer_dir ==
>>>> RTE_DMA_DIR_MEM_TO_MEM) {
>>>>> +		size_t src_remsz = buf_size % cfg->src_ptrs;
>>>>> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
>>>>> +		size_t src_sz = buf_size / cfg->src_ptrs;
>>>>> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
>>>>> +		uint8_t src[buf_size], dst[buf_size];
>>>>> +		uint8_t *sbuf, *dbuf, *ptr;
>>>>> +
>>>>> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
>>>> i++) {
>>>>> +			sbuf = src;
>>>>> +			dbuf = dst;
>>>>> +			ptr = NULL;
>>>>> +
>>>>> +			for (j = 0; j < cfg->src_ptrs; j++) {
>>>>> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
>>>> + j], uint8_t *);
>>>>> +				memcpy(sbuf, ptr, src_sz);
>>>>> +				sbuf += src_sz;
>>>>> +			}
>>>>> +
>>>>> +			if (src_remsz)
>>>>> +				memcpy(sbuf, ptr + src_sz, src_remsz);
>>>>> +
>>>>> +			for (j = 0; j < cfg->dst_ptrs; j++) {
>>>>> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
>>>> + j], uint8_t *);
>>>>> +				memcpy(dbuf, ptr, dst_sz);
>>>>> +				dbuf += dst_sz;
>>>>> +			}
>>>>> +
>>>>> +			if (dst_remsz)
>>>>> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
>>>>> +
>>>>> +			if (memcmp(src, dst, buf_size) != 0) {
>>>>> +				printf("SG Copy validation fails for buffer
>>>> number %d\n",
>>>>> +					i * cfg->src_ptrs);
>>>>> +				ret = -1;
>>>>> +				goto out;
>>>>> +			}
>>>>
>>>> Now I doubt the value of verify, this verify can't find the middle
>>>> round copy failure, because as long as the last round copy is
>>>> successful, the validation will pass.
>>>>
>>> Validation is on entire buffer. If any middle copy is a failure,
>>> entire memcmp would have failed. Or do I miss something ?
>>>
>>>> And adding validatation in every round copy will impact performance.
>>>>
>>> This validation is just after worker function is stopped measuring perf.
>>> How would this impact performance ?
>>
>> Yes, it will don't impact performance.
>>
>> What I said before is that is not valid, pls consider following scene:
>>
>>
>> 	while (1) {
>> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
>> let's defind this is a round copy.
>> dma_copy:
>> 			ret = rte_dma_copy(dev_id, 0,
>> rte_mbuf_data_iova(srcs[i]),
>> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>> 			if (unlikely(ret < 0)) {
>> 				if (ret == -ENOSPC) {
>> 					do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 					goto dma_copy;
>> 				} else
>> 					error_exit(dev_id);
>> 			}
>> 			async_cnt++;
>>
>> 			if ((async_cnt % kick_batch) == 0)
>> 				do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 		}
>>
>> 		if (worker_info->stop_flag)   // if don't stop, it will do many
>> round copies.
>> 			break;
>> 	}
>>
>> and the later validation just verify the last round, let's assume there are 100
>> round, and if the last round copy work well, but round 0~98 both copy fail,
>> then the validation will not detect it.
>>
>>
>> So if we want do all the validation, then we should add the velidation after
>> every round copy, but it will impact the performance.
>>
>>
>>>
>>>> Also app/test_dmadev already verify data. so I think we should drop
>>>> the validation commit.
>>>
>>> Even in some corner cases or unknown issues, copy would have failed
>>> and taking perf cycles then is meaningless. That is the reason, this
>>> validation is added after perf function doing its job.
>>
>> How about:
>>
>> 	while (1) {
>> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
>> let's defind this is a round copy.
>> dma_copy:
>> 			ret = rte_dma_copy(dev_id, 0,
>> rte_mbuf_data_iova(srcs[i]),
>> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>> 			if (unlikely(ret < 0)) {
>> 				if (ret == -ENOSPC) {
>> 					do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 					goto dma_copy;
>> 				} else
>> 					error_exit(dev_id);
>> 			}
>> 			async_cnt++;
>>
>> 			if ((async_cnt % kick_batch) == 0)
>> 				do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 		}
>>
>> 		if (unlikely(work_info->verify)) {
>> 			ret = verify();
>> 			if (ret != 0) {
>> 				// error trace,
>> 				break;
>> 			}
>> 		}
>>
>> 		if (worker_info->stop_flag)   // if don't stop, it will do many
>> round copies.
>> 			break;
>> 	}
>>
>> and make this verify as a config entry
> 
> I believe there is a difference in understanding of what this is intended to do. Intention here is not to validate every operation done by DMA, and that is already taken care by UT.
> 
> Is it possible that we are we misreporting numbers if the application is buggy or PMD is misbehaving for the scenario under test and the copies are not actually performed? Yes. Think about a scenario where PMD is buggy when trying bursts of more than 1.
> 
> Checking last set of buffers is more like testing a sample from the perf test to make sure perf test was indeed performing what it is claiming to do. If you think it is unnecessary to do so, we can drop this from upstream. But adding complete verification in performance app would be repeating what a unit test is expected to do. I would suggest not to do that.
I think this commit mainly tests dma-perf itself.
OK to continue with this commit.
Thanks
> 
> Thanks,
> Gowrishankar
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [v11 0/4] PCI Dev and SG copy support
  2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
                                     ` (3 preceding siblings ...)
  2024-02-29 13:48                   ` [v11 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
@ 2024-03-06 19:50                   ` Thomas Monjalon
  2024-03-07 13:48                     ` fengchengwen
  2024-03-07 13:48                     ` Gowrishankar Muthukrishnan
  4 siblings, 2 replies; 79+ messages in thread
From: Thomas Monjalon @ 2024-03-06 19:50 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan
  Cc: dev, anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Chengwen Feng, Jerin
29/02/2024 14:48, Gowrishankar Muthukrishnan:
> Improve dma-perf application to support PCI dev and SG copy,
> along with additional supports as below:
>  - validate copied memory
>  - skip tests if not opted.
> 
> v11:
> - Review suggestions.
> 
> Gowrishankar Muthukrishnan (4):
>   app/dma-perf: add skip support
>   app/dma-perf: add PCI device support
>   app/dma-perf: validate copied memory
>   app/dma-perf: add SG copy support
Waiting for a confirmation that this series is good to go.
Recheck-request: iol-unit-amd64-testing
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-06 19:50                   ` [v11 0/4] PCI Dev and " Thomas Monjalon
@ 2024-03-07 13:48                     ` fengchengwen
  2024-03-07 13:55                       ` [EXTERNAL] " Gowrishankar Muthukrishnan
  2024-03-07 13:48                     ` Gowrishankar Muthukrishnan
  1 sibling, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-03-07 13:48 UTC (permalink / raw)
  To: Thomas Monjalon, Gowrishankar Muthukrishnan
  Cc: dev, anoobj, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh, Amit Prakash Shukla, Jerin
Hi Thomas and Gowrishankar,
On 2024/3/7 3:50, Thomas Monjalon wrote:
> 29/02/2024 14:48, Gowrishankar Muthukrishnan:
>> Improve dma-perf application to support PCI dev and SG copy,
>> along with additional supports as below:
>>  - validate copied memory
>>  - skip tests if not opted.
>>
>> v11:
>> - Review suggestions.
>>
>> Gowrishankar Muthukrishnan (4):
>>   app/dma-perf: add skip support
>>   app/dma-perf: add PCI device support
>>   app/dma-perf: validate copied memory
>>   app/dma-perf: add SG copy support
> 
> Waiting for a confirmation that this series is good to go.
In the discussion in thread [1], I hope this patchset will take a further step
forward (meaning a new version) to support bi-directional tests just by modifying
the config.ini file.
[1] [PATCH v2] app/dma-perf: support bi-directional transfer
Thanks
> 
> Recheck-request: iol-unit-amd64-testing
> 
> 
> .
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-06 19:50                   ` [v11 0/4] PCI Dev and " Thomas Monjalon
  2024-03-07 13:48                     ` fengchengwen
@ 2024-03-07 13:48                     ` Gowrishankar Muthukrishnan
  1 sibling, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-07 13:48 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Chengwen Feng,
	Jerin Jacob
Hi Thomas,
> 29/02/2024 14:48, Gowrishankar Muthukrishnan:
> > Improve dma-perf application to support PCI dev and SG copy, along
> > with additional supports as below:
> >  - validate copied memory
> >  - skip tests if not opted.
> >
> > v11:
> > - Review suggestions.
> >
> > Gowrishankar Muthukrishnan (4):
> >   app/dma-perf: add skip support
> >   app/dma-perf: add PCI device support
> >   app/dma-perf: validate copied memory
> >   app/dma-perf: add SG copy support
> 
> Waiting for a confirmation that this series is good to go.
> 
The patches are already acked by the maintainer and are ready for merge.
Thanks,
Gowrishankar
> Recheck-request: iol-unit-amd64-testing
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-07 13:48                     ` fengchengwen
@ 2024-03-07 13:55                       ` Gowrishankar Muthukrishnan
  2024-03-12  9:15                         ` Thomas Monjalon
  0 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-07 13:55 UTC (permalink / raw)
  To: fengchengwen, Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
Hi Fengchengwen,
> > Waiting for a confirmation that this series is good to go.
> 
> In the discuss of thread [1], I hope this patchset continue take a step forward
> (means new version) to support bi-direction test just by modify config.ini file.
> 
This patch set already exposes all configuration via config.ini. I didn't follow what is missing. For bi-direction, it would be better to continue the discussion on that patch.
Thanks,
Gowrishankar
> [1] [PATCH v2] app/dma-perf: support bi-directional transfer
> 
> Thanks
> 
> >
> > Recheck-request: iol-unit-amd64-testing
> >
> >
> > .
> >
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-07 13:55                       ` [EXTERNAL] " Gowrishankar Muthukrishnan
@ 2024-03-12  9:15                         ` Thomas Monjalon
  2024-03-12 12:05                           ` fengchengwen
  0 siblings, 1 reply; 79+ messages in thread
From: Thomas Monjalon @ 2024-03-12  9:15 UTC (permalink / raw)
  To: fengchengwen
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob,
	Gowrishankar Muthukrishnan
[-- Attachment #1: Type: text/plain, Size: 583 bytes --]
07/03/2024 14:55, Gowrishankar Muthukrishnan:
> Hi Fengchengwen,
> 
> > > Waiting for a confirmation that this series is good to go.
> > 
> > In the discuss of thread [1], I hope this patchset continue take a step forward
> > (means new version) to support bi-direction test just by modify config.ini file.
> > 
> 
> This patch set already exposes all configuration via config.ini. I didn't follow what is missing. For bi-direction, we can better continue discussing on that patch.
Chengwen, please can you confirm whether you require a new version?
Which change exactly is missing?
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-12  9:15                         ` Thomas Monjalon
@ 2024-03-12 12:05                           ` fengchengwen
  2024-03-12 12:24                             ` Gowrishankar Muthukrishnan
  0 siblings, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-03-12 12:05 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob,
	Gowrishankar Muthukrishnan
Hi Thomas,
On 2024/3/12 17:15, Thomas Monjalon wrote:
> 07/03/2024 14:55, Gowrishankar Muthukrishnan:
>> Hi Fengchengwen,
>>
>>>> Waiting for a confirmation that this series is good to go.
>>>
>>> In the discuss of thread [1], I hope this patchset continue take a step forward
>>> (means new version) to support bi-direction test just by modify config.ini file.
>>>
>>
>> This patch set already exposes all configuration via config.ini. I didn't follow what is missing. For bi-direction, we can better continue discussing on that patch.
> 
> Chengwen, please can you confirm whether you require a new version?
> Which change exactly is missing?
This patchset is OK as long as one sub-test only tackles one DMA direction.
But there is a later patch [1] which will support multiple DMA directions within one sub-test.
It will add an entry "xfer_mode", but I think that complicates the test. I would prefer that we do more in
this patchset to support something like bi-direction just by modifying config.ini, something like this:
1. extend lcore_dma:
   current lcore_dma is: lcore10@0000:00:04.2
   extend it to support: lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
2. to work around one entry not being able to hold many DMA devices, support the entries lcore_dma_1, lcore_dma_2,
   whose values have the same format as lcore_dma.
So for bi-direction, we just define config.ini as:
lcore_dma=lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0xXXXX, lcore10@0000:00:04.2,dir=d2m,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
[1] https://patches.dpdk.org/project/dpdk/patch/20240229141426.4188428-1-amitprakashs@marvell.com/
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-12 12:05                           ` fengchengwen
@ 2024-03-12 12:24                             ` Gowrishankar Muthukrishnan
  2024-03-13  7:26                               ` fengchengwen
  0 siblings, 1 reply; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-12 12:24 UTC (permalink / raw)
  To: fengchengwen, Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
Hi Fengchengwen
> 
> Hi Thomas,
> 
> On 2024/3/12 17:15, Thomas Monjalon wrote:
> > 07/03/2024 14:55, Gowrishankar Muthukrishnan:
> >> Hi Fengchengwen,
> >>
> >>>> Waiting for a confirmation that this series is good to go.
> >>>
> >>> In the discuss of thread [1], I hope this patchset continue take a
> >>> step forward (means new version) to support bi-direction test just by
> modify config.ini file.
> >>>
> >>
> >> This patch set already exposes all configuration via config.ini. I didn't follow
> what is missing. For bi-direction, we can better continue discussing on that
> patch.
> >
> > Chengwen, please can you confirm whether you require a new version?
> > Which change exactly is missing?
> 
> This patchset is OK with one sub-test only tackle one DMA direction.
> 
Thanks for the confirmation.
> But there is a later patch [1] which will support multiple DMA directions within
> one sub-test.
> it will add a entry "xfer_mode", but I think it complicate the test, I prefer we do
> more in this patchset to support some like bi-direction just by modify
> config.ini, some like this:
> 
I think we should discuss that in the bi-directional patch series. This series is self-contained and there is no need to add bi-directional support as part of this series. As far as this patch set is concerned, all the options are exposed via config.ini. Can you comment on whether anything is missing, assuming that we are taking bi-directional support as a separate feature addition?
Thanks,
Gowrishankar
> 1. extend lcore_dma:
>    current lcore_dma is: lcore10@0000:00:04.2
>    extend it support:
> lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
> 2. to fix one entry can't hold too many dma device, support entrys:
> lcore_dma_1, lcore_dma_2
>    which value is same with lcore_dma.
> 
> So for bi-direction, we just define config.ini as:
> lcore_dma=lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0x
> XXXX, lcore10@0000:00:04.2,dir=d2m,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
> 
> [1] https://urldefense.proofpoint.com/v2/url?u=https-
> 3A__patches.dpdk.org_project_dpdk_patch_20240229141426.4188428-
> 2D1-2Damitprakashs-
> 40marvell.com_&d=DwICaQ&c=nKjWec2b6R0mOyPaz7xtfQ&r=EAtr-
> g7yUFhtOio8r2Rtm13Aqe4WVp_S_gHpcu6KFVo&m=DUaL_AJR1zqM0T2yw3
> aV44EObB90uqw5weFzSm-
> w39citSeGozNdEe4kzicss_KG&s=UTAcoZx5DjSJHyzxyLMxXz1bPqfPXQM7feDx
> ZdC6Jgk&e=
> 
> >
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-12 12:24                             ` Gowrishankar Muthukrishnan
@ 2024-03-13  7:26                               ` fengchengwen
  2024-03-13  8:22                                 ` Gowrishankar Muthukrishnan
  0 siblings, 1 reply; 79+ messages in thread
From: fengchengwen @ 2024-03-13  7:26 UTC (permalink / raw)
  To: Gowrishankar Muthukrishnan, Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
Hi Gowrishankar,
On 2024/3/12 20:24, Gowrishankar Muthukrishnan wrote:
> Hi Fengchengwen
> 
>>
>> Hi Thomas,
>>
>> On 2024/3/12 17:15, Thomas Monjalon wrote:
>>> 07/03/2024 14:55, Gowrishankar Muthukrishnan:
>>>> Hi Fengchengwen,
>>>>
>>>>>> Waiting for a confirmation that this series is good to go.
>>>>>
>>>>> In the discuss of thread [1], I hope this patchset continue take a
>>>>> step forward (means new version) to support bi-direction test just by
>> modify config.ini file.
>>>>>
>>>>
>>>> This patch set already exposes all configuration via config.ini. I didn't follow
>> what is missing. For bi-direction, we can better continue discussing on that
>> patch.
>>>
>>> Chengwen, please can you confirm whether you require a new version?
>>> Which change exactly is missing?
>>
>> This patchset is OK with one sub-test only tackle one DMA direction.
>>
> Thanks for the confirmation.
> 
>> But there is a later patch [1] which will support multiple DMA directions within
>> one sub-test.
>> it will add a entry "xfer_mode", but I think it complicate the test, I prefer we do
>> more in this patchset to support some like bi-direction just by modify
>> config.ini, some like this:
>>
> I think we should discuss about that in bi-directional patch series. This series is self-contained and there is no need to add bi-directional as part of this series. As far as this patch set is concerned, all the options are exposed via config.ini. Can you comment if there is anything missing, assuming that we are taking bi-directional support as a separate feature addition.
I have identified some improvements to the dma-perf app, and I plan to make them in
24.07, so if you don't mind, I will incorporate your commits (keeping your signed-off-by),
modify them to match what I described above, and then send them to the community (along with my
improvement commits).
Thanks
> 
> Thanks,
> Gowrishankar
> 
>> 1. extend lcore_dma:
>>    current lcore_dma is: lcore10@0000:00:04.2
>>    extend it support:
>> lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
>> 2. to fix one entry can't hold too many dma device, support entrys:
>> lcore_dma_1, lcore_dma_2
>>    which value is same with lcore_dma.
>>
>> So for bi-direction, we just define config.ini as:
>> lcore_dma=lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0x
>> XXXX, lcore10@0000:00:04.2,dir=d2m,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
>>
>> [1] https://urldefense.proofpoint.com/v2/url?u=https-
>> 3A__patches.dpdk.org_project_dpdk_patch_20240229141426.4188428-
>> 2D1-2Damitprakashs-
>> 40marvell.com_&d=DwICaQ&c=nKjWec2b6R0mOyPaz7xtfQ&r=EAtr-
>> g7yUFhtOio8r2Rtm13Aqe4WVp_S_gHpcu6KFVo&m=DUaL_AJR1zqM0T2yw3
>> aV44EObB90uqw5weFzSm-
>> w39citSeGozNdEe4kzicss_KG&s=UTAcoZx5DjSJHyzxyLMxXz1bPqfPXQM7feDx
>> ZdC6Jgk&e=
>>
>>>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-13  7:26                               ` fengchengwen
@ 2024-03-13  8:22                                 ` Gowrishankar Muthukrishnan
  2024-03-15  7:30                                   ` Gowrishankar Muthukrishnan
  2024-03-15 13:09                                   ` Thomas Monjalon
  0 siblings, 2 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-13  8:22 UTC (permalink / raw)
  To: fengchengwen, Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
Hi Fengchengwen
> Hi Gowrishankar,
> 
> On 2024/3/12 20:24, Gowrishankar Muthukrishnan wrote:
> > Hi Fengchengwen
> >
> >>
> >> Hi Thomas,
> >>
> >> On 2024/3/12 17:15, Thomas Monjalon wrote:
> >>> 07/03/2024 14:55, Gowrishankar Muthukrishnan:
> >>>> Hi Fengchengwen,
> >>>>
> >>>>>> Waiting for a confirmation that this series is good to go.
> >>>>>
> >>>>> In the discuss of thread [1], I hope this patchset continue take a
> >>>>> step forward (means new version) to support bi-direction test just
> >>>>> by
> >> modify config.ini file.
> >>>>>
> >>>>
> >>>> This patch set already exposes all configuration via config.ini. I
> >>>> didn't follow
> >> what is missing. For bi-direction, we can better continue discussing
> >> on that patch.
> >>>
> >>> Chengwen, please can you confirm whether you require a new version?
> >>> Which change exactly is missing?
> >>
> >> This patchset is OK with one sub-test only tackle one DMA direction.
> >>
> > Thanks for the confirmation.
> >
> >> But there is a later patch [1] which will support multiple DMA
> >> directions within one sub-test.
> >> it will add a entry "xfer_mode", but I think it complicate the test,
> >> I prefer we do more in this patchset to support some like
> >> bi-direction just by modify config.ini, some like this:
> >>
> > I think we should discuss about that in bi-directional patch series. This series
> is self-contained and there is no need to add bi-directional as part of this
> series. As far as this patch set is concerned, all the options are exposed via
> config.ini. Can you comment if there is anything missing, assuming that we are
> taking bi-directional support as a separate feature addition.
> 
> I have identified some improvements to the dma-perf app, and I plan to do it
It is unclear at this point what issue you have with the app or this patch set. This series was first submitted on Aug 10 2023. You acked v8 on Jan 25 2024. After the patches were acked, there were still review comments on variable renames etc., which were all addressed. The patches have been under review for more than 8 months with very slow progress.
> in 24.07, so if you don't mind, I will incorporate your commits (keeping your
> signed-off-by) and modify to the one that I described above, and then send to
> community (also with my improvements commits).
I would like to have this series merged first and not pulled into another series. We do have a few other features that we would like to add on top. I would assume that you can also add your changes on top. To make contribution easier, isn't it better to accept at least this patch set (as you acked earlier) and then you can continue working on the improvements?
Thanks,
Gowrishankar
> 
> Thanks
> 
> >
> > Thanks,
> > Gowrishankar
> >
> >> 1. extend lcore_dma:
> >>    current lcore_dma is: lcore10@0000:00:04.2
> >>    extend it support:
> >> lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
> >> 2. to fix one entry can't hold too many dma device, support entrys:
> >> lcore_dma_1, lcore_dma_2
> >>    which value is same with lcore_dma.
> >>
> >> So for bi-direction, we just define config.ini as:
> >>
> lcore_dma=lcore10@0000:00:04.2,dir=m2d,coreid=1,pfid=2,vfid=3,raddr=0
> >> x XXXX,
> >> lcore10@0000:00:04.2,dir=d2m,coreid=1,pfid=2,vfid=3,raddr=0xXXXX
> >>
> >> [1] https://urldefense.proofpoint.com/v2/url?u=https-
> >> 3A__patches.dpdk.org_project_dpdk_patch_20240229141426.4188428-
> >> 2D1-2Damitprakashs-
> >> 40marvell.com_&d=DwICaQ&c=nKjWec2b6R0mOyPaz7xtfQ&r=EAtr-
> >>
> g7yUFhtOio8r2Rtm13Aqe4WVp_S_gHpcu6KFVo&m=DUaL_AJR1zqM0T2yw3
> >> aV44EObB90uqw5weFzSm-
> >>
> w39citSeGozNdEe4kzicss_KG&s=UTAcoZx5DjSJHyzxyLMxXz1bPqfPXQM7feDx
> >> ZdC6Jgk&e=
> >>
> >>>
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-13  8:22                                 ` Gowrishankar Muthukrishnan
@ 2024-03-15  7:30                                   ` Gowrishankar Muthukrishnan
  2024-03-15 13:09                                   ` Thomas Monjalon
  1 sibling, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-15  7:30 UTC (permalink / raw)
  To: fengchengwen, Thomas Monjalon
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
Hi Thomas,
> 
> Hi Fengchengwen
> 
> > Hi Gowrishankar,
> >
> > On 2024/3/12 20:24, Gowrishankar Muthukrishnan wrote:
> > > Hi Fengchengwen
> > >
> > >>
> > >> Hi Thomas,
> > >>
> > >> On 2024/3/12 17:15, Thomas Monjalon wrote:
> > >>> 07/03/2024 14:55, Gowrishankar Muthukrishnan:
> > >>>> Hi Fengchengwen,
> > >>>>
> > >>>>>> Waiting for a confirmation that this series is good to go.
> > >>>>>
> > >>>>> In the discuss of thread [1], I hope this patchset continue take
> > >>>>> a step forward (means new version) to support bi-direction test
> > >>>>> just by
> > >> modify config.ini file.
> > >>>>>
> > >>>>
> > >>>> This patch set already exposes all configuration via config.ini.
> > >>>> I didn't follow
> > >> what is missing. For bi-direction, we can better continue
> > >> discussing on that patch.
> > >>>
> > >>> Chengwen, please can you confirm whether you require a new version?
> > >>> Which change exactly is missing?
> > >>
> > >> This patchset is OK with one sub-test only tackle one DMA direction.
> > >>
> > > Thanks for the confirmation.
> > >
> > >> But there is a later patch [1] which will support multiple DMA
> > >> directions within one sub-test.
> > >> it will add a entry "xfer_mode", but I think it complicate the
> > >> test, I prefer we do more in this patchset to support some like
> > >> bi-direction just by modify config.ini, some like this:
> > >>
> > > I think we should discuss about that in bi-directional patch series.
> > > This series
> > is self-contained and there is no need to add bi-directional as part
> > of this series. As far as this patch set is concerned, all the options
> > are exposed via config.ini. Can you comment if there is anything
> > missing, assuming that we are taking bi-directional support as a separate
> feature addition.
> >
> > I have identified some improvements to the dma-perf app, and I plan to
> > do it
> 
> It is unclear at this point what is the issue that you have with the app or this
> patch set. This series was first submitted on Aug 10 2023. You had acked v8 on
> Jan 25 2024. After the patches were acked, there were still review comments
> on variable renames etc, which were all addressed. The patches had been
> under review for more than 8 months with very slow progress.
> 
> > in 24.07, so if you don't mind, I will incorporate your commits
> > (keeping your
> > signed-off-by) and modify to the one that I described above, and then
> > send to community (also with my improvements commits).
> 
> I would like to have this series merged first and not pulled into another series.
> We do have few other features that we would like to add on top. I would
> assume that you can also add your changes on top. To make contribution
> easier, isn't it better to accept at least this patch set (as you acked earlier) and
> then you can continue working on the improvements?
> 
Can this series be merged?
Thanks,
Gowrishankar
> Thanks,
> Gowrishankar
> 
^ permalink raw reply	[flat|nested] 79+ messages in thread
* Re: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-13  8:22                                 ` Gowrishankar Muthukrishnan
  2024-03-15  7:30                                   ` Gowrishankar Muthukrishnan
@ 2024-03-15 13:09                                   ` Thomas Monjalon
  2024-03-18  7:32                                     ` Gowrishankar Muthukrishnan
  1 sibling, 1 reply; 79+ messages in thread
From: Thomas Monjalon @ 2024-03-15 13:09 UTC (permalink / raw)
  To: fengchengwen, Gowrishankar Muthukrishnan
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
13/03/2024 09:22, Gowrishankar Muthukrishnan:
> Hi Fengchengwen
> 
> > Hi Gowrishankar,
> > 
> > On 2024/3/12 20:24, Gowrishankar Muthukrishnan wrote:
> > > Hi Fengchengwen
> > >
> > >>
> > >> Hi Thomas,
> > >>
> > >> On 2024/3/12 17:15, Thomas Monjalon wrote:
> > >>> 07/03/2024 14:55, Gowrishankar Muthukrishnan:
> > >>>> Hi Fengchengwen,
> > >>>>
> > >>>>>> Waiting for a confirmation that this series is good to go.
> > >>>>>
> > >>>>> In the discuss of thread [1], I hope this patchset continue take a
> > >>>>> step forward (means new version) to support bi-direction test just
> > >>>>> by
> > >> modify config.ini file.
> > >>>>>
> > >>>>
> > >>>> This patch set already exposes all configuration via config.ini. I
> > >>>> didn't follow
> > >> what is missing. For bi-direction, we can better continue discussing
> > >> on that patch.
> > >>>
> > >>> Chengwen, please can you confirm whether you require a new version?
> > >>> Which change exactly is missing?
> > >>
> > >> This patchset is OK with one sub-test only tackle one DMA direction.
> > >>
> > > Thanks for the confirmation.
> > >
> > >> But there is a later patch [1] which will support multiple DMA
> > >> directions within one sub-test.
> > >> it will add a entry "xfer_mode", but I think it complicate the test,
> > >> I prefer we do more in this patchset to support some like
> > >> bi-direction just by modify config.ini, some like this:
> > >>
> > > I think we should discuss about that in bi-directional patch series. This series
> > is self-contained and there is no need to add bi-directional as part of this
> > series. As far as this patch set is concerned, all the options are exposed via
> > config.ini. Can you comment if there is anything missing, assuming that we are
> > taking bi-directional support as a separate feature addition.
> > 
> > I have identified some improvements to the dma-perf app, and I plan to do it
> 
> It is unclear at this point what is the issue that you have with the app or this patch set. This series was first submitted on Aug 10 2023. You had acked v8 on Jan 25 2024. After the patches were acked, there were still review comments on variable renames etc, which were all addressed. The patches had been under review for more than 8 months with very slow progress.
> 
> > in 24.07, so if you don't mind, I will incorporate your commits (keeping your
> > signed-off-by) and modify to the one that I described above, and then send to
> > community (also with my improvements commits).
> 
> I would like to have this series merged first and not pulled into another series. We do have few other features that we would like to add on top. I would assume that you can also add your changes on top. To make contribution easier, isn't it better to accept at least this patch set (as you acked earlier) and then you can continue working on the improvements?
OK, one feature at a time.
Let's work on top of this patchset applied.
^ permalink raw reply	[flat|nested] 79+ messages in thread
* RE: [EXTERNAL] Re: [v11 0/4] PCI Dev and SG copy support
  2024-03-15 13:09                                   ` Thomas Monjalon
@ 2024-03-18  7:32                                     ` Gowrishankar Muthukrishnan
  0 siblings, 0 replies; 79+ messages in thread
From: Gowrishankar Muthukrishnan @ 2024-03-18  7:32 UTC (permalink / raw)
  To: Thomas Monjalon, fengchengwen
  Cc: dev, Anoob Joseph, Cheng Jiang, Kevin Laatz, Bruce Richardson,
	Pavan Nikhilesh Bhagavatula, Amit Prakash Shukla, Jerin Jacob
Hi Thomas and Fengchengwen,
> 
> 13/03/2024 09:22, Gowrishankar Muthukrishnan:
> > Hi Fengchengwen
> >
> > > Hi Gowrishankar,
> > >
> > > On 2024/3/12 20:24, Gowrishankar Muthukrishnan wrote:
> > > > Hi Fengchengwen
> > > >
> > > >>
> > > >> Hi Thomas,
> > > >>
> > > >> On 2024/3/12 17:15, Thomas Monjalon wrote:
> > > >>> 07/03/2024 14:55, Gowrishankar Muthukrishnan:
> > > >>>> Hi Fengchengwen,
> > > >>>>
> > > >>>>>> Waiting for a confirmation that this series is good to go.
> > > >>>>>
> > > >>>>> In the discuss of thread [1], I hope this patchset continue
> > > >>>>> take a step forward (means new version) to support
> > > >>>>> bi-direction test just by
> > > >> modify config.ini file.
> > > >>>>>
> > > >>>>
> > > >>>> This patch set already exposes all configuration via
> > > >>>> config.ini. I didn't follow
> > > >> what is missing. For bi-direction, we can better continue
> > > >> discussing on that patch.
> > > >>>
> > > >>> Chengwen, please can you confirm whether you require a new
> version?
> > > >>> Which change exactly is missing?
> > > >>
> > > >> This patchset is OK with one sub-test only tackle one DMA direction.
> > > >>
> > > > Thanks for the confirmation.
> > > >
> > > >> But there is a later patch [1] which will support multiple DMA
> > > >> directions within one sub-test.
> > > >> it will add a entry "xfer_mode", but I think it complicate the
> > > >> test, I prefer we do more in this patchset to support some like
> > > >> bi-direction just by modify config.ini, some like this:
> > > >>
> > > > I think we should discuss about that in bi-directional patch
> > > > series. This series
> > > is self-contained and there is no need to add bi-directional as part
> > > of this series. As far as this patch set is concerned, all the
> > > options are exposed via config.ini. Can you comment if there is
> > > anything missing, assuming that we are taking bi-directional support as a
> separate feature addition.
> > >
> > > I have identified some improvements to the dma-perf app, and I plan
> > > to do it
> >
> > It is unclear at this point what is the issue that you have with the app or this
> patch set. This series was first submitted on Aug 10 2023. You had acked v8 on
> Jan 25 2024. After the patches were acked, there were still review comments
> on variable renames etc, which were all addressed. The patches had been
> under review for more than 8 months with very slow progress.
> >
> > > in 24.07, so if you don't mind, I will incorporate your commits
> > > (keeping your
> > > signed-off-by) and modify to the one that I described above, and
> > > then send to community (also with my improvements commits).
> >
> > I would like to have this series merged first and not pulled into another
> series. We do have few other features that we would like to add on top. I
> would assume that you can also add your changes on top. To make
> contribution easier, isn't it better to accept at least this patch set (as you acked
> earlier) and then you can continue working on the improvements?
> 
> OK, one feature at a time.
> Let's work on top of this patchset applied.
> 
Thank you both for reviewing this series and accepting it in RC3.
Regards,
Gowrishankar
^ permalink raw reply	[flat|nested] 79+ messages in thread
end of thread, other threads:[~2024-03-18  7:32 UTC | newest]
Thread overview: 79+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-10 10:57 [PATCH v2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2023-08-10 13:01 ` [PATCH v3 0/2] " Gowrishankar Muthukrishnan
2023-08-10 13:01   ` [PATCH v3 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2023-08-23 11:46     ` [EXT] " Pavan Nikhilesh Bhagavatula
2023-08-10 13:01   ` [PATCH v3 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2023-09-21  3:02   ` [PATCH v3 0/2] " Jiang, Cheng1
2023-09-24  9:32   ` [PATCH v4 " Gowrishankar Muthukrishnan
2023-09-24  9:32     ` [PATCH v4 1/2] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2023-09-24  9:32     ` [PATCH v4 2/2] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2023-09-28 21:12       ` Pavan Nikhilesh Bhagavatula
2023-10-26 18:31     ` [PATCH v5 0/4] app/dma-perf: PCI Dev and " Gowrishankar Muthukrishnan
2023-10-26 18:31       ` [PATCH v5 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
2023-11-10  9:03         ` Anoob Joseph
2023-10-26 18:31       ` [PATCH v5 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
2023-11-10  9:04         ` Anoob Joseph
2023-10-26 18:31       ` [PATCH v5 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2023-11-10  9:05         ` Anoob Joseph
2023-10-26 18:31       ` [PATCH v5 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2023-11-10  9:07         ` Anoob Joseph
2023-11-13  4:41       ` [PATCH v6 0/4] PCI Dev and " Gowrishankar Muthukrishnan
2023-11-13  4:41         ` [PATCH v6 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
2023-11-13  4:41         ` [PATCH v6 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
2023-11-13  4:41         ` [PATCH v6 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2023-11-13  4:41         ` [PATCH v6 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2023-11-17 12:15         ` [PATCH v7 0/4] PCI Dev and " Gowrishankar Muthukrishnan
2023-11-17 12:15           ` [PATCH v7 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
2023-11-20  2:54             ` fengchengwen
2023-11-22 12:01               ` [EXT] " Amit Prakash Shukla
2023-11-17 12:15           ` [PATCH v7 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
2023-11-17 12:15           ` [PATCH v7 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2023-11-17 12:15           ` [PATCH v7 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2023-11-22 11:06           ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
2023-11-22 11:06             ` [PATCH v8 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
2023-11-22 11:06             ` [PATCH v8 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
2023-11-23  1:12               ` fengchengwen
2024-02-21  3:26               ` fengchengwen
2024-02-27  9:27                 ` [EXT] " Amit Prakash Shukla
2023-11-22 11:06             ` [PATCH v8 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2023-11-23  1:14               ` fengchengwen
2023-11-22 11:06             ` [PATCH v8 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2024-01-25 12:44               ` fengchengwen
2024-02-21  3:52               ` fengchengwen
2024-02-27 16:09                 ` [EXT] " Gowrishankar Muthukrishnan
2023-12-07 10:11             ` [PATCH v8 0/4] PCI Dev and " Gowrishankar Muthukrishnan
2024-02-05 10:37               ` Gowrishankar Muthukrishnan
2024-02-27 16:00             ` [PATCH v9 " Amit Prakash Shukla
2024-02-27 16:00               ` [PATCH v9 1/4] app/dma-perf: add skip support Amit Prakash Shukla
2024-02-27 16:00               ` [PATCH v9 2/4] app/dma-perf: add PCI device support Amit Prakash Shukla
2024-02-27 16:00               ` [PATCH v9 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
2024-02-27 16:00               ` [PATCH v9 4/4] app/dma-perf: add SG copy support Amit Prakash Shukla
2024-02-27 18:35               ` [PATCH v10 0/4] PCI Dev and " Amit Prakash Shukla
2024-02-27 18:35                 ` [PATCH v10 1/4] app/dma-perf: add skip support Amit Prakash Shukla
2024-02-27 18:35                 ` [PATCH v10 2/4] app/dma-perf: add PCI device support Amit Prakash Shukla
2024-02-27 18:35                 ` [PATCH v10 3/4] app/dma-perf: validate copied memory Amit Prakash Shukla
2024-02-28  8:10                   ` fengchengwen
2024-02-28  9:09                     ` [EXT] " Gowrishankar Muthukrishnan
2024-02-29 13:48                 ` [v11 0/4] PCI Dev and SG copy support Gowrishankar Muthukrishnan
2024-02-29 13:48                   ` [v11 1/4] app/dma-perf: add skip support Gowrishankar Muthukrishnan
2024-02-29 13:48                   ` [v11 2/4] app/dma-perf: add PCI device support Gowrishankar Muthukrishnan
2024-02-29 13:48                   ` [v11 3/4] app/dma-perf: validate copied memory Gowrishankar Muthukrishnan
2024-02-29 13:48                   ` [v11 4/4] app/dma-perf: add SG copy support Gowrishankar Muthukrishnan
2024-03-06 19:50                   ` [v11 0/4] PCI Dev and " Thomas Monjalon
2024-03-07 13:48                     ` fengchengwen
2024-03-07 13:55                       ` [EXTERNAL] " Gowrishankar Muthukrishnan
2024-03-12  9:15                         ` Thomas Monjalon
2024-03-12 12:05                           ` fengchengwen
2024-03-12 12:24                             ` Gowrishankar Muthukrishnan
2024-03-13  7:26                               ` fengchengwen
2024-03-13  8:22                                 ` Gowrishankar Muthukrishnan
2024-03-15  7:30                                   ` Gowrishankar Muthukrishnan
2024-03-15 13:09                                   ` Thomas Monjalon
2024-03-18  7:32                                     ` Gowrishankar Muthukrishnan
2024-03-07 13:48                     ` Gowrishankar Muthukrishnan
2024-02-27 18:56               ` [PATCH v10 4/4] app/dma-perf: add " Amit Prakash Shukla
2024-02-28  9:31                 ` fengchengwen
2024-02-29 13:16                   ` [EXT] " Gowrishankar Muthukrishnan
2024-03-01  2:07                     ` fengchengwen
2024-03-01  8:06                       ` [EXTERNAL] " Gowrishankar Muthukrishnan
2024-03-01  9:45                         ` fengchengwen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).