DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] app/dma-perf: introduce dma-perf application
@ 2023-04-20  7:22 Cheng Jiang
  2023-05-17  6:16 ` [PATCH v2] " Cheng Jiang
                   ` (9 more replies)
  0 siblings, 10 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-04-20  7:22 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 467 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  56 ++++
 app/test-dma-perf/main.c      | 445 ++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  56 ++++
 app/test-dma-perf/meson.build |  17 ++
 6 files changed, 1042 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..514cb2f7b2 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..36e3413bdc
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,467 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+
+/*
+ * Handshake flags and completion counters for one worker, accessed by both
+ * the main lcore and the worker lcore.
+ */
+struct worker_info {
+	bool ready_flag;	/* set by the worker once it has loaded its parameters */
+	bool start_flag;	/* set by the main lcore to release the worker */
+	bool stop_flag;	/* cleared by the worker at entry, set by the main lcore to end the run */
+	uint32_t total_cpl;	/* running count of completed copies, updated by the worker */
+	uint32_t test_cpl;	/* completions attributed to the measured interval */
+};
+
+/* Parameters handed to one worker lcore for a single benchmark run. */
+struct lcore_params {
+	uint8_t scenario_id;	/* scenario number copied from the test configuration */
+	unsigned int lcore_id;	/* lcore the worker is launched on */
+	uint16_t worker_id;	/* index of this worker in worker_params[] */
+	uint16_t dev_id;	/* dmadev id; only assigned for DMA runs */
+	uint32_t nr_buf;	/* number of buffer pairs in this worker's slice */
+	uint16_t kick_batch;	/* enqueues between submits; only assigned for DMA runs */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint16_t test_secs;	/* configured test duration in seconds */
+	struct rte_mbuf **srcs;	/* first source mbuf of this worker's slice */
+	struct rte_mbuf **dsts;	/* first destination mbuf of this worker's slice */
+	struct worker_info worker_info;	/* flags and counters shared with the main lcore */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+uint16_t dmadev_ids[MAX_WORKER_NB];
+uint32_t nb_dmadevs;
+
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write "In <func>:<line> - " followed by the printf-style message to stderr.
+ * Returns the sum of the fprintf() and vfprintf() return values.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int prefix_len;
+	int msg_len;
+
+	prefix_len = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	msg_len = vfprintf(stderr, format, args);
+	va_end(args);
+
+	return prefix_len + msg_len;
+}
+
+/*
+ * Derive the per-worker figures that are reported for one scenario.
+ *
+ * buf_size:   bytes copied per operation
+ * nr_buf:     total number of buffer pairs across all workers
+ * nb_workers: number of workers sharing nr_buf
+ * test_secs:  length of the measured interval in seconds
+ * total_cnt:  copies completed by this worker during the interval
+ * memory:     out, per-worker footprint (source plus destination) in MB
+ * ave_cycle:  out, average timer cycles per copy
+ * bandwidth:  out, Gbps
+ * mops:       out, million operations per second
+ *
+ * When no copy completed (or the average rounds down to zero cycles) the
+ * rate outputs are set to 0 instead of dividing by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, uint32_t *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	const uint64_t hz = rte_get_timer_hz();
+	uint64_t bytes = 0;
+
+	/* 64-bit product: buf_size * nr_buf easily exceeds 4 GiB */
+	if (nb_workers != 0)
+		bytes = (uint64_t)buf_size * (nr_buf / nb_workers) * 2;
+	*memory = (uint32_t)(bytes / (1024 * 1024));
+
+	*ave_cycle = (total_cnt != 0) ? (uint32_t)(test_secs * hz / total_cnt) : 0;
+	if (*ave_cycle == 0) {
+		*bandwidth = 0;
+		*mops = 0;
+		return;
+	}
+
+	*bandwidth = ((float)buf_size * 8 * (hz / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)hz / *ave_cycle / 1000000;
+}
+
+/*
+ * Print one worker's result to stdout and store the matching CSV line in
+ * output_str[lcore_id].
+ *
+ * dev_id is only printed and recorded when is_dma is true; the CPU variant
+ * writes "NA" in the DMA column. If lcore_id has no slot in output_str the
+ * result is still printed but no CSV line is stored.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %u:\n", lcore_id, dev_id);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %uMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	/* output_str is indexed by lcore id and has MAX_WORKER_NB rows */
+	if (lcore_id >= MAX_WORKER_NB) {
+		printf("lcore %u has no CSV slot, result not recorded.\n", lcore_id);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dev_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of every mbuf's data area from the CPU
+ * cache, one clflush per 64-byte step. Compiles to nothing unless
+ * RTE_ARCH_X86_64 is defined.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t nr_buf,
+		__maybe_unused uint32_t buf_size)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t pos = 0;
+
+		while (pos < buf_size) {
+			__builtin_ia32_clflush(base + pos);
+			pos += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure one dmadev with a single mem-to-mem virtual channel of
+ * ring_size descriptors and start it. Any failure terminates the
+ * application through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration\n");
+
+	/* info is only valid when the query itself succeeded */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_info_get()\n");
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+
+/*
+ * Assign one dmadev to each worker, in enumeration order, and configure it.
+ *
+ * Fills dmadev_ids[] and nb_dmadevs. Returns 0 when every worker got a
+ * device, -1 when there are too few devices or more workers than
+ * dmadev_ids[] can hold.
+ */
+static int
+config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
+{
+	int16_t dev_id = rte_dma_next_dev(0);
+	uint32_t i;
+
+	nb_dmadevs = 0;
+
+	if (nb_workers > RTE_DIM(dmadev_ids)) {
+		printf("Too many workers (%u), at most %u are supported.\n",
+			nb_workers, (uint32_t)RTE_DIM(dmadev_ids));
+		return -1;
+	}
+
+	/* rte_dma_next_dev() reports -1 once the device list is exhausted */
+	for (i = 0; i < nb_workers && dev_id != -1; i++) {
+		dmadev_ids[i] = dev_id;
+		configure_dmadev_queue(dmadev_ids[i], ring_size);
+		++nb_dmadevs;
+		dev_id = rte_dma_next_dev(dev_id + 1);
+	}
+
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+/*
+ * Worker entry point for the DMA copy benchmark.
+ *
+ * p points to a uint16_t holding this worker's index into worker_params[].
+ * The worker raises ready_flag, spins until start_flag is set, then keeps
+ * enqueueing copies of its srcs[i] -> dsts[i] pairs on virtual channel 0 of
+ * its dmadev until stop_flag is observed at the end of a pass.
+ * Completions are accumulated in worker_info->total_cpl. Always returns 0.
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	int64_t async_cnt = 0;	/* copies enqueued but not yet seen as completed */
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait until the main lcore releases all workers */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				/* enqueue failed: submit what is pending and reap
+				 * completions until the ring reports free space
+				 */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				/* single retry; a second failure is fatal */
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/* NOTE(review): kick_batch == 0 would divide by zero here */
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		/* stop_flag is only checked between full passes over the buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/* drain outstanding copies, giving up after POLL_MAX polls; these
+	 * completions are not added to total_cpl
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker entry point for the CPU copy benchmark.
+ *
+ * p points to a uint16_t holding this worker's index into worker_params[].
+ * The worker raises ready_flag, spins until start_flag is set, then keeps
+ * copying buf_size bytes from each srcs[i] to dsts[i] with rte_memcpy()
+ * until stop_flag is observed at the end of a pass. Every copy increments
+ * worker_info->total_cpl. Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait until the main lcore releases all workers */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/* copy buffer from src to dst through the CPU-visible
+			 * (virtual) data pointers; an IOVA is a device address
+			 * and is not guaranteed to be dereferenceable
+			 */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+
+		/* stop_flag is only checked between full passes over the buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA
+ * nodes and take cfg->nr_buf mbufs from each.
+ *
+ * On success returns 0 with *srcs and *dsts pointing at rte_malloc'ed
+ * arrays of nr_buf mbuf pointers. On failure returns -1 with *srcs and
+ * *dsts set to NULL; pools that were already created stay referenced by
+ * src_pool / dst_pool so the caller can release them.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	*srcs = NULL;
+	*dsts = NULL;
+	/* the caller frees the pools at the end of every run; drop stale handles */
+	src_pool = NULL;
+	dst_pool = NULL;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	/* rte_pktmbuf_pool_create() takes the data room size as a uint16_t */
+	if (buf_size > UINT16_MAX - RTE_PKTMBUF_HEADROOM) {
+		printf("Error: buffer size %u does not fit in a single mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		goto free_arrays;
+	}
+
+	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		goto free_arrays;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		goto free_arrays;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		goto free_arrays;
+	}
+
+	return 0;
+
+free_arrays:
+	/* never hand back arrays that are not completely filled with mbufs */
+	rte_free(*dsts);
+	rte_free(*srcs);
+	*dsts = NULL;
+	*srcs = NULL;
+	return -1;
+}
+
+/*
+ * Run one scenario of the memory copy benchmark and record the results.
+ *
+ * cfg:    test case configuration; the "cur" value of each entry selects
+ *         the scenario. cfg->nr_buf is computed and stored here.
+ * is_dma: true to copy with dmadevs, false to copy with the CPU.
+ *
+ * The function sets up the buffers, launches one worker per configured
+ * worker thread, lets them run for a short warm-up plus cfg->test_secs
+ * seconds, then stops them, reports each worker's result and releases
+ * everything it allocated.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint16_t nb_launched = 0;
+	uint32_t offset;
+	unsigned int lcore_id  = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf;
+	uint16_t nb_workers = cfg->nb_workers;
+	uint16_t test_secs = cfg->test_secs;
+	uint32_t memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+	bool mem_ready = false;
+
+	/* these values are used as divisors or array bounds below */
+	if (buf_size == 0 || cfg->nb_workers == 0 || cfg->nb_workers > MAX_WORKER_NB ||
+			(is_dma && kick_batch == 0)) {
+		printf("Error: invalid buf_size, worker_threads or kick_batch.\n");
+		return;
+	}
+
+	/* half of the footprint is source, half destination; 64-bit to avoid wrap */
+	nr_buf = (uint32_t)(((uint64_t)cfg->mem_size.cur * 1024 * 1024) /
+			((uint64_t)buf_size * 2));
+	cfg->nr_buf = nr_buf;
+	if (nr_buf / nb_workers == 0) {
+		printf("Error: not enough buffers (%u) for %u workers.\n", nr_buf, nb_workers);
+		return;
+	}
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+	mem_ready = true;
+
+	if (is_dma)
+		if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, nr_buf, buf_size);
+		cache_flush_buf(dsts, nr_buf, buf_size);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		void *para_idx;
+		int ret;
+
+		lcore_id = rte_get_next_lcore(lcore_id, true, true);
+		offset = nr_buf / nb_workers * i;
+
+		/* zeroed so that the handshake flags and counters start cleared */
+		worker_params[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %u\n", lcore_id);
+			break;
+		}
+		if (is_dma) {
+			worker_params[i]->dev_id = dmadev_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		/* pass the worker's own stored index: the loop counter keeps
+		 * changing while the worker is still starting up
+		 */
+		para_idx = (void *)(uintptr_t)&worker_params[i]->worker_id;
+		if (is_dma)
+			ret = rte_eal_remote_launch(do_dma_mem_copy, para_idx, lcore_id);
+		else
+			ret = rte_eal_remote_launch(do_cpu_mem_copy, para_idx, lcore_id);
+		if (ret != 0) {
+			printf("Failed to launch worker on lcore %u\n", lcore_id);
+			rte_free((void *)(uintptr_t)worker_params[i]);
+			worker_params[i] = NULL;
+			break;
+		}
+		nb_launched++;
+	}
+
+	/* wait until every launched worker has picked up its parameters */
+	for (i = 0; i < nb_launched; i++)
+		while (!worker_params[i]->worker_info.ready_flag)
+			;
+
+	if (nb_launched < nb_workers) {
+		/* setup failed part-way: let the waiting workers run out and bail */
+		for (i = 0; i < nb_launched; i++) {
+			worker_params[i]->worker_info.stop_flag = true;
+			worker_params[i]->worker_info.start_flag = true;
+		}
+		rte_eal_mp_wait_lcore();
+		goto out;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	/* warm-up: completions before this point are excluded from the result */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl;
+
+	/* sleep() rather than usleep(): the interval is a whole number of seconds */
+	sleep(test_secs);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl -
+						worker_params[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		if (worker_params[i]->worker_info.test_cpl == 0) {
+			printf("lcore %u completed no copies, result skipped.\n",
+				worker_params[i]->lcore_id);
+			continue;
+		}
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dev_id, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	for (i = 0; i < nb_launched; i++) {
+		rte_free((void *)(uintptr_t)worker_params[i]);
+		worker_params[i] = NULL;
+	}
+
+	/* the arrays only hold valid mbufs once setup_memory_env() succeeded */
+	if (mem_ready) {
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+	}
+	rte_free(srcs);
+	rte_free(dsts);
+
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		dst_pool = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_dmadevs; i++) {
+			printf("Stopping dmadev %d\n", dmadev_ids[i]);
+			rte_dma_stop(dmadev_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..47bcdd2eb2
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,56 @@
+
+; Supported test types:
+; DMA_MEM_COPY|CPU_MEM_COPY
+
+; Parameters:
+; "mem_size","buf_size","dma_ring_size","kick_batch".
+; "mem_size" means the size of the memory footprint.
+; "buf_size" means the memory size of a single operation.
+; "dma_ring_size" means the dma ring buffer size.
+; "kick_batch" means dma operation batch size.
+
+; Format: variable=first[,last,increment[,ADD|MUL]]
+; ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to control if the cache should be flushed.
+
+; test_seconds is used to control the test time of the whole case.
+
+; worker_threads sets the number of worker threads used by the test.
+; It must be smaller than the number of lcores given to the application.
+
+; mpool_iter_step is used to control the buffer continuity.
+
+; Bind DMA to lcore:
+; Specify the "lcore_dma" parameter.
+; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
+; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
+; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
+; their range.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+worker_threads=1
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+worker_threads=1
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..00176db6d6
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,445 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+/*
+ * Values stored in test_configure.test_type. TEST_TYPE_NONE (0) marks an
+ * unused test_cases[] slot, which main() skips.
+ */
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every non-empty output_str row to the result file and mark the
+ * row as empty again. With need_blankline set, two rows of empty CSV cells
+ * are written first as a separator.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		if (output_str[row][0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", output_str[row]);
+		output_str[row][0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/* Write the "test environment" block (timer frequency) to the result file. */
+static void
+output_env_info(void)
+{
+	const size_t row_len = sizeof(output_str[0]);
+
+	snprintf(output_str[0], row_len, "test environment:\n");
+	snprintf(output_str[1], row_len, "frequency,%" PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+/* Write the CSV column header for one test case to the result file. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	const char *type_name = case_cfg->test_type_str;
+
+	snprintf(output_str[0], sizeof(output_str[0]), CSV_HDR_FMT, case_id, type_name);
+
+	output_csv(true);
+}
+
+/*
+ * Dispatch one scenario to the benchmark matching the case's test type;
+ * an unrecognised type is only reported.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, true);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, false);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * The entry with a non-zero increment (at most one is expected per case)
+ * is swept from "first" to "last" using its ADD or MUL step; a case with
+ * no variable entry runs a single scenario. Results are appended to the
+ * CSV file after each scenario.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* one lcore is the main lcore, so workers need nb_workers + 1 lcores */
+	if (nb_lcores <= case_cfg->nb_workers) {
+		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
+			case_id, nb_lcores, case_cfg->nb_workers);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		uint64_t next;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else
+			break;
+
+		/* stop when the step makes no progress (e.g. x1, or 0 x n) or
+		 * no longer fits in 32 bits; either would loop forever
+		 */
+		if (next <= var_entry->cur || next > UINT32_MAX)
+			break;
+		var_entry->cur = (uint32_t)next;
+	}
+
+}
+
+/*
+ * Parse a "first[,last,increment[,ADD|MUL]]" configuration value.
+ *
+ * value: string from the config file; NULL or empty is rejected.
+ * entry: filled with first/last/incr/op, and cur set to first.
+ *
+ * Returns the number of comma-separated fields found, or -1 on error.
+ * When an increment is given without an operator, ADD is used, as the
+ * sample config file documents.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr = -1;
+
+	/* rte_cfgfile_get_entry() reports a missing key as NULL */
+	if (value == NULL || *value == '\0')
+		goto out;
+
+	strlcpy(input, value, sizeof(input));
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s\n", args[3]);
+			args_nr = -1;
+		}
+	} else if (args_nr == 3 && entry->incr != 0) {
+		/* increment without an operator: fall back to the default mode */
+		entry->op = OP_ADD;
+	} else
+		entry->op = OP_NONE;
+out:
+	return args_nr;
+}
+
+/* Look up a mandatory integer entry; exit with a message when it is missing. */
+static int
+get_int_entry(struct rte_cfgfile *cfgfile, const char *section, const char *name)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, name);
+
+	if (str == NULL) {
+		printf("Error: No \"%s\" entry in section %s.\n", name, section);
+		exit(1);
+	}
+
+	return atoi(str);
+}
+
+/* Look up and parse a range entry; returns parse_entry()'s result, -1 if missing. */
+static int
+get_range_entry(struct rte_cfgfile *cfgfile, const char *section, const char *name,
+		struct test_configure_entry *entry)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, name);
+
+	if (str == NULL) {
+		printf("Error: No \"%s\" entry in section %s.\n", name, section);
+		return -1;
+	}
+
+	return parse_entry(str, entry);
+}
+
+/*
+ * Load the test cases from the config file at path into test_cases[].
+ *
+ * Sections are expected to be named "case1", "case2", ... A missing or
+ * unknown case type, or a missing mandatory integer entry, terminates the
+ * program. A range entry that cannot be parsed, or more than one variable
+ * entry in a section, stops loading; the offending case is marked unused
+ * so that it is not run half-configured.
+ */
+static void
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case = NULL;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *eal_args;
+	int args_nr, nb_vp;
+	bool is_dma;
+
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d.\n", i + 1);
+			exit(1);
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Cannot find case type %s.\n", case_type);
+			exit(1);
+		}
+
+		nb_vp = 0;
+
+		test_case->src_numa_node = get_int_entry(cfgfile, section_name, "src_numa_node");
+		test_case->dst_numa_node = get_int_entry(cfgfile, section_name, "dst_numa_node");
+
+		args_nr = get_range_entry(cfgfile, section_name, "mem_size",
+						&test_case->mem_size);
+		if (args_nr < 0)
+			goto parse_error;
+		else if (args_nr > 1)
+			nb_vp++;
+
+		args_nr = get_range_entry(cfgfile, section_name, "buf_size",
+						&test_case->buf_size);
+		if (args_nr < 0)
+			goto parse_error;
+		else if (args_nr > 1)
+			nb_vp++;
+
+		if (is_dma) {
+			args_nr = get_range_entry(cfgfile, section_name, "dma_ring_size",
+							&test_case->ring_size);
+			if (args_nr < 0)
+				goto parse_error;
+			else if (args_nr > 1)
+				nb_vp++;
+
+			args_nr = get_range_entry(cfgfile, section_name, "kick_batch",
+							&test_case->kick_batch);
+			if (args_nr < 0)
+				goto parse_error;
+			else if (args_nr > 1)
+				nb_vp++;
+		}
+
+		if (nb_vp > 1) {
+			printf("Error, each section can only have a single variable parameter.\n");
+			goto invalid_case;
+		}
+
+		test_case->cache_flush = get_int_entry(cfgfile, section_name, "cache_flush");
+		test_case->test_secs =
+			(uint16_t)get_int_entry(cfgfile, section_name, "test_seconds");
+		test_case->nb_workers =
+			(uint16_t)get_int_entry(cfgfile, section_name, "worker_threads");
+
+		/* strings returned by the cfgfile are released by
+		 * rte_cfgfile_close(); keep a private copy for later use
+		 */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->eal_args = eal_args ? strdup(eal_args) : NULL;
+		if (eal_args && !test_case->eal_args) {
+			printf("Error: Cannot copy eal_args of case %d.\n", i + 1);
+			exit(1);
+		}
+	}
+
+	rte_cfgfile_close(cfgfile);
+	return;
+
+parse_error:
+	printf("parse error\n");
+invalid_case:
+	/* do not let a half-configured case be picked up by the runner */
+	test_case->test_type = TEST_TYPE_NONE;
+	rte_cfgfile_close(cfgfile);
+}
+
+/*
+ * Build the EAL argument vector for one test case.
+ *
+ * Copies argv into new_argv, dropping "--config" / "--result" together
+ * with their values, then appends the space-separated tokens of eal_args
+ * (may be NULL). new_argv must provide MAX_EAL_PARAM_NB buffers of
+ * MAX_EAL_PARAM_LEN bytes each; arguments beyond that count are dropped
+ * and longer ones are truncated.
+ *
+ * Returns the number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc && new_argc < MAX_EAL_PARAM_NB; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			i++;	/* skip the value argument */
+			continue;
+		}
+		/* size of the destination buffer, not of the pointer to it */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb && new_argc < MAX_EAL_PARAM_NB; i++) {
+			/* consecutive spaces produce empty tokens */
+			if (tokens[i][0] == '\0')
+				continue;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point: load the test cases from the file given with "--config"
+ * and run each of them in its own forked child process, so that every
+ * case can initialise the EAL with its own arguments. Results are
+ * appended to the file given with "--result", or to
+ * "<config name>_result.csv" next to the config file when it is omitted.
+ *
+ * Returns 0 on completion, -1 when the config or result file is unusable.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option without a value leaves the path unset */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		char *name;
+
+		/* if the result output file is not configured, base it on the
+		 * config file name: cut the name at its first '.' (ignoring
+		 * leading dots) and append "_result.csv"
+		 */
+		if (strlcpy(rst_path, cfg_path_ptr, sizeof(rst_path)) >= sizeof(rst_path)) {
+			printf("Config file path is too long.\n");
+			return -1;
+		}
+		name = strrchr(rst_path, '/');
+		name = (name != NULL) ? name + 1 : rst_path;
+		name += strspn(name, ".");
+		name[strcspn(name, ".")] = '\0';
+		if (strlcat(rst_path, "_result.csv", sizeof(rst_path)) >= sizeof(rst_path)) {
+			printf("Result file path is too long.\n");
+			return -1;
+		}
+		rst_path_ptr = rst_path;
+	}
+
+	load_configs(cfg_path_ptr);
+
+	/* create or truncate the result file; the children append to it */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE)
+			continue;
+
+		/* keep buffered output from being emitted again by the child */
+		fflush(stdout);
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %u failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				rte_eal_cleanup();
+				exit(EXIT_FAILURE);
+			}
+
+			if (is_first_case)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/* the child works on its own copy of the flag, so the
+			 * parent has to record that the first case was started
+			 */
+			is_first_case = false;
+
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n");
+			else
+				printf("Case process unknown terminated.\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..235cb74daf
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a variable configuration entry steps from one scenario to the next. */
+typedef enum {
+	OP_NONE = 0,	/* no stepping: the sweep ends after one scenario */
+	OP_ADD,	/* cur += incr */
+	OP_MUL	/* cur *= incr */
+} alg_op_type;
+
+/* One config value written as "first[,last,increment[,ADD|MUL]]". */
+struct test_configure_entry {
+	uint32_t first;	/* starting value */
+	uint32_t last;	/* inclusive upper bound of the sweep; 0 when not given */
+	uint32_t incr;	/* step operand; 0 when not given */
+	alg_op_type op;	/* how incr is applied to cur */
+	uint32_t cur;	/* value used by the scenario currently running */
+};
+
+/* Settings of one test case, loaded from one section of the config file. */
+struct test_configure {
+	uint8_t test_type;	/* TEST_TYPE_* value; 0 marks an unused slot */
+	const char *test_type_str;	/* name of the test type, used in messages and the CSV header */
+	uint16_t src_numa_node;	/* NUMA node of the source buffers */
+	uint16_t dst_numa_node;	/* NUMA node of the destination buffers */
+	uint16_t opcode;	/* NOTE(review): not referenced by the code in this patch */
+	bool is_dma;	/* NOTE(review): not referenced by the code in this patch */
+	struct test_configure_entry mem_size;	/* memory footprint in MB */
+	struct test_configure_entry buf_size;	/* bytes per copy operation */
+	struct test_configure_entry ring_size;	/* DMA ring size; only parsed for DMA cases */
+	struct test_configure_entry kick_batch;	/* DMA batch size; only parsed for DMA cases */
+	uint32_t cache_flush;	/* non-zero: flush the buffers from the cache before the test */
+	uint32_t nr_buf;	/* number of buffer pairs, computed by mem_copy_benchmark() */
+	uint16_t test_secs;	/* measured duration in seconds */
+	uint32_t nb_workers;	/* number of worker threads */
+	const char *eal_args;	/* extra EAL arguments for this case, may be NULL */
+	uint8_t scenario_id;	/* 1-based number of the scenario being run */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
-- 
2.35.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v2] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
@ 2023-05-17  6:16 ` Cheng Jiang
  2023-05-17  7:31 ` [PATCH v3] " Cheng Jiang
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-05-17  6:16 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 471 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  59 ++++
 app/test-dma-perf/main.c      | 567 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 +++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1184 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..514cb2f7b2 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..4e99ab9736
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+
+/* State shared between the launching lcore and a single worker lcore. */
+struct worker_info {
+	bool ready_flag;	/* set by the worker right before it starts waiting for start_flag */
+	bool start_flag;	/* set by the launcher; the worker spins until it becomes true */
+	bool stop_flag;		/* cleared by the worker on entry, set by the launcher to end the copy loop */
+	uint32_t total_cpl;	/* running count of copies the worker has accounted as completed */
+	uint32_t test_cpl;	/* launcher scratch: total_cpl snapshot, later the delta over the test window */
+};
+
+/* Arguments and shared state of one worker; filled in by mem_copy_benchmark(). */
+struct lcore_params {
+	uint8_t scenario_id;	/* copied from cfg->scenario_id */
+	unsigned int lcore_id;	/* lcore the worker is launched on */
+	char *dma_name;		/* DMA device name; only assigned for DMA tests */
+	uint16_t worker_id;	/* index of this worker in worker_params[] */
+	uint16_t dev_id;	/* dmadev id; only assigned for DMA tests */
+	uint32_t nr_buf;	/* number of buffers in this worker's slice */
+	uint16_t kick_batch;	/* DMA submit batch size; only assigned for DMA tests */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint16_t test_secs;	/* test duration in seconds */
+	struct rte_mbuf **srcs;	/* first source mbuf of this worker's slice */
+	struct rte_mbuf **dsts;	/* first destination mbuf of this worker's slice */
+	struct worker_info worker_info;	/* flags and counters shared with the launcher */
+};
+
+/* mbuf pools backing the source and destination buffers of the running scenario */
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+/* per-worker parameter blocks, indexed by worker id */
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write an "In <func>:<line> - " prefix followed by the printf-style message
+ * to stderr. Returns the sum of the fprintf() and vfprintf() return values.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int written = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	written += vfprintf(stderr, format, args);
+	va_end(args);
+
+	return written;
+}
+
+/*
+ * Derive the per-worker figures reported for one scenario.
+ *
+ * memory    - MB covered by one worker (source plus destination buffers)
+ * ave_cycle - timer cycles per completed copy over the test window
+ * bandwidth - Gbps computed from buf_size and ave_cycle
+ * mops      - million copies per second
+ *
+ * All outputs are set to 0 first, so a worker that completed nothing (or a
+ * zero worker count) yields zeros instead of a division by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, uint32_t *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	uint64_t timer_hz = rte_get_timer_hz();
+
+	*memory = 0;
+	*ave_cycle = 0;
+	*bandwidth = 0;
+	*mops = 0;
+
+	/* widen before multiplying: buf_size * nr_buf * 2 easily exceeds 32 bits */
+	if (nb_workers != 0)
+		*memory = (uint32_t)(((uint64_t)buf_size * (nr_buf / nb_workers) * 2) /
+				(1024 * 1024));
+
+	if (total_cnt == 0)
+		return;
+
+	*ave_cycle = test_secs * timer_hz / total_cnt;
+	if (*ave_cycle == 0)
+		return;
+
+	*bandwidth = ((float)buf_size * 8 * (timer_hz / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)timer_hz / *ave_cycle / 1000000;
+}
+
+/*
+ * Print the result of one worker to stdout and store the matching CSV line
+ * in output_str[lcore_id], from where main.c writes it to the result file.
+ *
+ * output_str has MAX_WORKER_NB rows, so an lcore id beyond that range is
+ * reported on stderr and gets no CSV line instead of writing out of bounds.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %uMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (lcore_id >= MAX_WORKER_NB) {
+		fprintf(stderr, "Error: no CSV slot for lcore %u (limit is %u).\n",
+			lcore_id, MAX_WORKER_NB - 1);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of every mbuf data area in array from the
+ * CPU caches, in 64-byte steps. Compiles to a no-op unless RTE_ARCH_X86_64
+ * is defined.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure DMA device dev_id with a single MEM_TO_MEM virtual channel of
+ * ring_size descriptors and start it. Any failure terminates the process
+ * through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	/* info is only valid when the query succeeded, so check it before reading */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve every DMA device name of the lcore/DMA map to a device id, store
+ * the id in the map, then configure and start the device.
+ *
+ * Returns 0 when a device was set up for every worker, -1 otherwise.
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		/* the lookup reports failure with a negative errno value, not -1 */
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+/*
+ * Worker body for the DMA copy test.
+ *
+ * p points to a uint16_t holding this worker's index into worker_params[].
+ * The worker clears stop_flag, raises ready_flag, spins until start_flag is
+ * set and then keeps enqueueing copies of its buffers until stop_flag is
+ * seen. Always returns 0; an enqueue that fails twice in a row ends the
+ * process through rte_exit().
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	/* number of enqueued copies not yet accounted as completed */
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait for the launcher's start signal */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				/* enqueue failed: submit what is pending and reap
+				 * completions until the ring reports free capacity
+				 */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				/* retry the same buffer once; a second failure is fatal */
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/* submit and poll whenever the outstanding count is a multiple of kick_batch */
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		/* stop_flag is only checked after a full pass over all buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/* drain outstanding copies, giving up after POLL_MAX polls; completions
+	 * reaped here are not added to total_cpl
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker body for the CPU copy test.
+ *
+ * p points to a uint16_t holding this worker's index into worker_params[].
+ * The worker clears stop_flag, raises ready_flag, spins until start_flag is
+ * set and then copies buf_size bytes from every source mbuf to the matching
+ * destination mbuf, pass after pass, until stop_flag is seen at the end of
+ * a pass. Each copy increments total_cpl. Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * Copy buffer from src to dst. The CPU has to use the
+			 * virtual data address; the IOVA is a device address
+			 * and is not dereferenceable when IOVA is physical.
+			 */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools for the current scenario and
+ * take cfg->nr_buf mbufs out of each one.
+ *
+ * On return *srcs and *dsts are either NULL or point to a pointer array that
+ * is safe to hand to rte_pktmbuf_free_bulk(): the arrays are zero-filled on
+ * allocation and released again when fetching the mbufs fails. The pools are
+ * stored in src_pool and dst_pool, which are reset first so that a failed
+ * setup never leaves a pointer to an already freed pool behind.
+ *
+ * Returns 0 on success, -1 on failure. The caller owns the cleanup.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	/* the caller frees the pools after every scenario; drop stale pointers */
+	src_pool = NULL;
+	dst_pool = NULL;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	/* the data room size of a pktmbuf pool is a 16-bit value */
+	if (buf_size > UINT16_MAX - RTE_PKTMBUF_HEADROOM) {
+		PRINT_ERR("Error: buffer size %u does not fit into one mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/* zero-filled, so a partially set up array only holds NULL entries */
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		/* nothing was handed out: make sure the caller frees no mbuf */
+		rte_free(*srcs);
+		*srcs = NULL;
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		rte_free(*dsts);
+		*dsts = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one memory copy scenario and record one result line per worker.
+ *
+ * cfg    - scenario description; cfg->nr_buf is set here from the current
+ *          memory and buffer sizes
+ * is_dma - true to copy with the DMA devices of the lcore/DMA map, false to
+ *          copy with the CPU
+ *
+ * The buffers are split evenly between the workers of the lcore map. All
+ * workers are launched, released together once each one reported ready, and
+ * measured over cfg->test_secs seconds after a short warm-up. Everything
+ * allocated for the scenario is released before returning, so the function
+ * can be called again for the next scenario.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint16_t nb_launched = 0;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	int (*worker_fn)(void *) = is_dma ? do_dma_mem_copy : do_cpu_mem_copy;
+	uint64_t total_buf;
+	uint32_t nr_buf = 0;
+	uint32_t memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+	bool dma_configured = false;
+
+	/* reject settings that would divide by zero or overrun worker_params[] */
+	if (nb_workers == 0 || nb_workers > MAX_WORKER_NB) {
+		printf("Error: invalid number of workers (%u).\n", nb_workers);
+		goto out;
+	}
+	if (buf_size == 0) {
+		printf("Error: buffer size must not be 0.\n");
+		goto out;
+	}
+	if (is_dma && kick_batch == 0) {
+		printf("Error: kick_batch must not be 0.\n");
+		goto out;
+	}
+
+	/* 64-bit arithmetic: mem_size in MB times 2^20 does not fit 32 bits */
+	total_buf = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (total_buf < nb_workers || total_buf > UINT32_MAX) {
+		printf("Error: memory size gives an unusable buffer count (%" PRIu64 ").\n",
+			total_buf);
+		goto out;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)total_buf;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma) {
+		/* devices may have been started even if the call fails */
+		dma_configured = true;
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+	}
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+
+		/* zeroed, so the ready/start/stop flags start out false */
+		worker_params[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		if (is_dma) {
+			worker_params[i]->dma_name = ldm->dma_names[i];
+			worker_params[i]->dev_id = ldm->dma_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		/*
+		 * Hand the worker a pointer to its own stored index. The loop
+		 * counter keeps changing while the worker starts up, so its
+		 * address must not be passed.
+		 */
+		if (rte_eal_remote_launch(worker_fn,
+				(void *)(uintptr_t)&worker_params[i]->worker_id,
+				lcore_id) != 0) {
+			printf("Error: failed to launch worker on lcore %u\n", lcore_id);
+			rte_free((void *)(uintptr_t)worker_params[i]);
+			worker_params[i] = NULL;
+			break;
+		}
+		nb_launched++;
+	}
+
+	if (nb_launched < nb_workers) {
+		/*
+		 * Abort: let the workers that are already waiting run to their
+		 * first stop check. stop_flag is set only after the worker
+		 * reported ready, because the worker clears it on entry.
+		 */
+		for (i = 0; i < nb_launched; i++) {
+			while (!worker_params[i]->worker_info.ready_flag)
+				;
+			worker_params[i]->worker_info.stop_flag = true;
+			worker_params[i]->worker_info.start_flag = true;
+		}
+		rte_eal_mp_wait_lcore();
+		goto out;
+	}
+
+	/* wait until every worker is ready, then release them together */
+	for (i = 0; i < nb_workers; i++)
+		while (!worker_params[i]->worker_info.ready_flag)
+			;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	/* warm-up, then snapshot the counters at the start of the window */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl;
+
+	/* sleep() takes seconds, so long tests cannot overflow a microsecond count */
+	sleep(test_secs);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl -
+						worker_params[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+	rte_free(srcs);
+	rte_free(dsts);
+
+	/* reset the pointers: the next scenario must not free the pools again */
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		dst_pool = NULL;
+	}
+
+	for (i = 0; i < nb_launched; i++) {
+		rte_free((void *)(uintptr_t)worker_params[i]);
+		worker_params[i] = NULL;
+	}
+
+	if (dma_configured) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..25af5be0c6
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,59 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It should be greater than 64 normally.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1.
+
+; The format for variables is variable=first[,last,increment[,ADD|MUL]]. ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of the whole case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the values of "lcore_dma" fall within the range of those parameters.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the values of "lcore" fall within the range of those parameters.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..b55b09d090
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,567 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+/* Values stored in test_configure.test_type; NONE marks a case without a type. */
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+/* one entry per "caseN" section of the config file, filled by load_configs() */
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+/* pending CSV lines; output_csv() writes and clears every non-empty row */
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* result CSV stream; despite its name this is a FILE pointer, not a descriptor */
+static FILE *fd;
+
+/*
+ * Append every pending line of output_str to the result file, clear the
+ * lines that were written and flush the file. When need_blankline is set,
+ * two rows of empty CSV cells are written first.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+	int pad;
+
+	for (pad = 0; need_blankline && pad < 2; pad++)
+		fprintf(fd, ",,,,,,,,\n");
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		if (output_str[row][0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", output_str[row]);
+		output_str[row][0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/* Queue the test environment rows (title and timer frequency) and write them out. */
+static void
+output_env_info(void)
+{
+	char *title_row = output_str[0];
+	char *freq_row = output_str[1];
+
+	snprintf(title_row, MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(freq_row, MAX_OUTPUT_STR_LEN, "CPU frequency,%" PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+/* Queue the CSV column header of test case case_id and write it out. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *header_row = output_str[0];
+
+	snprintf(header_row, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT,
+			case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/*
+ * Run the benchmark that matches the test type of case_cfg. An unknown type
+ * is only reported on stdout.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY || type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, type == TEST_TYPE_DMA_MEM_COPY);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * The entry that has a non-zero increment is the variable parameter; when
+ * several have one, the last of mem_size, buf_size, ring_size and kick_batch
+ * wins. Its current value is stepped from first to last, multiplying or
+ * adding the increment according to its op, and one scenario is run per
+ * value. Without a variable parameter a single scenario is run.
+ *
+ * The sweep ends as soon as a step does not advance the value or leaves the
+ * 32-bit range, so a multiplier of 1 or a start value of 0 cannot loop
+ * forever.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+	uint64_t next;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* one lcore is needed on top of the workers to drive the test */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else
+			break;
+
+		if (next <= var_entry->cur) {
+			printf("Case %u: the increment does not advance the variable parameter, stopping.\n",
+				case_id);
+			break;
+		}
+		/* beyond 32 bits the value is past any possible last value */
+		if (next > UINT32_MAX)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case, a list of lcore ids separated
+ * by commas and/or spaces, into the lcore map of test_case.
+ *
+ * Returns 0 on success. Returns -1 when an argument is NULL, the list is
+ * empty, an id is not a valid lcore number, or the list holds more ids than
+ * the map has room for.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *token, *end;
+	long lcore_id;
+	int ret = 0;
+
+	/* validate before touching either argument */
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	lcore_dma_map = &(test_case->lcore_dma_map);
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	/* strtok() writes into its input, so work on a private copy */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		/* bound by the real array size, not by MAX_LCORE_NB */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			ret = -1;
+			break;
+		}
+
+		lcore_id = strtol(token, &end, 10);
+		if (end == token || *end != '\0' || lcore_id < 0 || lcore_id >= RTE_MAX_LCORE) {
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint32_t)lcore_id;
+	}
+
+	if (ret == 0 && lcore_dma_map->cnt == 0)
+		ret = -1;
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case into the lcore map of
+ * test_case. The value is a comma separated list of "lcore<id>@<dma name>"
+ * items, e.g. "lcore10@0000:00:04.2, lcore11@0000:00:04.3".
+ *
+ * Returns 0 on success. Returns -1 when an argument is NULL, the list is
+ * empty, an item is malformed, a DMA name does not fit the map, or there are
+ * more items than the map has room for.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr, *name;
+	size_t name_len;
+	long lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	/* skip leading blanks; never step over the terminating NUL */
+	addrs = input;
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	do {
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcore_dma entries\n");
+			ret = -1;
+			break;
+		}
+
+		/* both halves are required, otherwise ptrs[1] is not set */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal lcore_dma entry, expected lcore<id>@<dma>\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 10);
+		if (end == start || lcore_id < 0 || lcore_id >= RTE_MAX_LCORE) {
+			fprintf(stderr, "No input lcore ID or ID %ld is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		/* trim blanks around the device name and make sure it fits */
+		name = ptrs[1];
+		while (*name == ' ' || *name == '\t')
+			name++;
+		name_len = strlen(name);
+		while (name_len > 0 && (name[name_len - 1] == ' ' || name[name_len - 1] == '\t'))
+			name[--name_len] = '\0';
+		if (name_len == 0 || name_len >= RTE_DEV_NAME_MAX_LEN) {
+			fprintf(stderr, "Illegal DMA name\n");
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = (uint32_t)lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], name,
+			RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse a config value of the form first[,last,increment[,ADD|MUL]] into
+ * entry.
+ *
+ * first also becomes the current value; last and increment default to 0.
+ * When an increment is given without an operation, ADD is used, which is the
+ * default mode documented in config.ini. Without an increment the operation
+ * is OP_NONE.
+ *
+ * Returns the number of fields found, or -1 when value or entry is NULL, the
+ * value is empty, or the operation is neither ADD nor MUL.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr = -1;
+
+	if (value == NULL || entry == NULL)
+		goto out;
+
+	/* rte_strsplit() modifies its input, so split a local copy */
+	strlcpy(input, value, sizeof(input));
+	if (*input == '\0')
+		goto out;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s\n", args[3]);
+			args_nr = -1;
+		}
+	} else if (args_nr > 2)
+		entry->op = OP_ADD;
+	else
+		entry->op = OP_NONE;
+out:
+	return args_nr;
+}
+
+/*
+ * Read the unsigned integer entry key of section into *out.
+ * Returns 0 on success, -1 when the entry is missing, does not start with a
+ * number, is negative or is larger than max.
+ */
+static int
+read_uint_entry(struct rte_cfgfile *cfgfile, const char *section,
+		const char *key, uint32_t max, uint32_t *out)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, key);
+	char *end;
+	long val;
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", key, section);
+		return -1;
+	}
+
+	val = strtol(str, &end, 10);
+	if (end == str || val < 0 || (unsigned long)val > max) {
+		printf("Error: Invalid %s in %s.\n", key, section);
+		return -1;
+	}
+
+	*out = (uint32_t)val;
+	return 0;
+}
+
+/*
+ * Load the test cases from the config file at path into test_cases[].
+ *
+ * Sections are expected to be named case1, case2, ... A case that cannot be
+ * parsed, or that lacks a required entry, is marked invalid and skipped; the
+ * remaining cases are still loaded. The process exits when the file cannot be
+ * opened or holds more than MAX_TEST_CASES sections.
+ *
+ * Returns the number of sections processed.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *eal_args;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	uint32_t src_node, dst_node, cache_flush, test_secs;
+	int args_nr, nb_vp;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		rte_cfgfile_close(cfgfile);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d, the test will be finished here.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Cannot find case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		nb_vp = 0;
+
+		/* a missing entry invalidates the case instead of crashing in atoi() */
+		if (read_uint_entry(cfgfile, section_name, "src_numa_node",
+					UINT16_MAX, &src_node) < 0 ||
+				read_uint_entry(cfgfile, section_name, "dst_numa_node",
+					UINT16_MAX, &dst_node) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->src_numa_node = (uint16_t)src_node;
+		test_case->dst_numa_node = (uint16_t)dst_node;
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Error, each section can only have a single variable parameter.\n");
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (read_uint_entry(cfgfile, section_name, "cache_flush",
+					UINT32_MAX, &cache_flush) < 0 ||
+				read_uint_entry(cfgfile, section_name, "test_seconds",
+					UINT16_MAX, &test_secs) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->cache_flush = cache_flush;
+		test_case->test_secs = (uint16_t)test_secs;
+
+		/*
+		 * The entry string belongs to the cfgfile object, which is
+		 * closed below, so the test case keeps its own copy. The copy
+		 * lives until the process exits.
+		 */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->eal_args = NULL;
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: No memory for eal_args in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector handed to rte_eal_init() for one test case.
+ *
+ * The command line is copied to new_argv without the --config and --result
+ * options and their values; the space separated eal_args of the test case
+ * are appended after it. new_argv must provide MAX_EAL_PARAM_NB buffers of
+ * MAX_EAL_PARAM_LEN bytes each; arguments beyond that number are dropped.
+ *
+ * Returns the number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc && new_argc < MAX_EAL_PARAM_NB; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* skip the option together with its value */
+			i++;
+			continue;
+		}
+		/* new_argv[] holds pointers, so sizeof() would give the pointer size */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb && new_argc < MAX_EAL_PARAM_NB; i++) {
+			/* consecutive spaces produce empty tokens */
+			if (tokens[i][0] == '\0')
+				continue;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point: load the test cases from the --config file and run every
+ * valid case in a forked child, so that each case gets its own EAL instance
+ * built from the command line plus the eal_args of the case.
+ *
+ * Results are appended to the --result file. Without --result, the file name
+ * is the config path with its extension replaced by "_result.csv". The test
+ * environment rows are written by the first case that completes
+ * successfully.
+ *
+ * Returns 0 when all cases were processed, -1 on a setup error.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last yields a NULL path */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		char *base, *dot;
+		size_t len;
+
+		if (strlen(cfg_path_ptr) >= sizeof(rst_path)) {
+			printf("Config file path is too long.\n");
+			return -1;
+		}
+		strlcpy(rst_path, cfg_path_ptr, sizeof(rst_path));
+
+		/* cut the file name at its first dot, keeping leading dots */
+		base = strrchr(rst_path, '/');
+		base = (base == NULL) ? rst_path : base + 1;
+		dot = strchr(base + strspn(base, "."), '.');
+		if (dot != NULL)
+			*dot = '\0';
+
+		len = strlen(rst_path);
+		if (len + sizeof("_result.csv") > sizeof(rst_path)) {
+			printf("Config file path is too long.\n");
+			return -1;
+		}
+		snprintf(rst_path + len, sizeof(rst_path) - len, "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* create or truncate the result file; the cases append to it */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %d.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			continue;
+		}
+
+		/* keep buffered output from being printed again by the child */
+		fflush(stdout);
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				rte_eal_cleanup();
+				exit(EXIT_FAILURE);
+			}
+
+			if (is_first_case)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus)) {
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+				/*
+				 * The child only changes its own copy of the
+				 * flag; the parent records that the
+				 * environment rows were written.
+				 */
+				if (WEXITSTATUS(wstatus) == EXIT_SUCCESS)
+					is_first_case = false;
+			} else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a variable parameter is stepped from one scenario to the next. */
+typedef enum {
+	OP_NONE = 0,	/* no stepping: a single scenario is run */
+	OP_ADD,		/* add the increment to the current value */
+	OP_MUL		/* multiply the current value by the increment */
+} alg_op_type;
+
+/* One config parameter given as first[,last,increment[,ADD|MUL]]. */
+struct test_configure_entry {
+	uint32_t first;	/* start value */
+	uint32_t last;	/* the sweep runs while cur <= last */
+	uint32_t incr;	/* step; a non-zero value makes this the variable parameter */
+	alg_op_type op;	/* how incr is applied to cur */
+	uint32_t cur;	/* value used by the scenario being run */
+};
+
+/* Workers of a test case: entry i describes worker i. */
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB];	/* lcore id of each worker */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];	/* DMA device name, DMA tests only */
+	int16_t dma_ids[MAX_WORKER_NB];	/* dmadev id looked up from the name, DMA tests only */
+	uint16_t cnt;	/* number of valid entries */
+};
+
+/* One test case, loaded from a "caseN" section of the config file. */
+struct test_configure {
+	bool is_valid;	/* false when the section could not be parsed */
+	uint8_t test_type;	/* one of the TEST_TYPE_* values in main.c */
+	const char *test_type_str;	/* type name as written in the config file */
+	uint16_t src_numa_node;	/* NUMA node of the source buffers */
+	uint16_t dst_numa_node;	/* NUMA node of the destination buffers */
+	uint16_t opcode;	/* NOTE(review): not referenced by the code in this patch */
+	bool is_dma;	/* NOTE(review): not referenced by the code in this patch */
+	struct lcore_dma_map_t lcore_dma_map;	/* workers and, for DMA tests, their devices */
+	struct test_configure_entry mem_size;	/* memory footprint, in MB */
+	struct test_configure_entry buf_size;	/* bytes per copy operation */
+	struct test_configure_entry ring_size;	/* DMA ring size, DMA tests only */
+	struct test_configure_entry kick_batch;	/* DMA submit batch size, DMA tests only */
+	uint32_t cache_flush;	/* non-zero to flush the buffers from the cache before the test */
+	uint32_t nr_buf;	/* buffer count, computed by mem_copy_benchmark() */
+	uint16_t test_secs;	/* measured duration of a scenario, in seconds */
+	const char *eal_args;	/* extra EAL arguments of the case, may be NULL */
+	uint8_t scenario_id;	/* number of the scenario being run, starting at 1 */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+# Skip this app entirely on Windows builds.
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+# DPDK libraries the app depends on.
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+# Source files compiled into the app.
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.35.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
  2023-05-17  6:16 ` [PATCH v2] " Cheng Jiang
@ 2023-05-17  7:31 ` Cheng Jiang
  2023-06-08  5:03 ` [PATCH v4] " Cheng Jiang
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-05-17  7:31 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 471 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  59 ++++
 app/test-dma-perf/main.c      | 567 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 +++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1184 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..514cb2f7b2 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..4e99ab9736
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+
+/*
+ * Handshake flags and completion counters shared between the launching
+ * thread (mem_copy_benchmark) and one worker (do_dma_mem_copy or
+ * do_cpu_mem_copy).
+ */
+struct worker_info {
+	bool ready_flag;	/* set by the worker just before it waits for start_flag */
+	bool start_flag;	/* set by the launcher to release the worker */
+	bool stop_flag;		/* set by the launcher; checked by the worker after each pass over its buffers */
+	uint32_t total_cpl;	/* running count of copies completed by the worker */
+	uint32_t test_cpl;	/* launcher scratch: total_cpl snapshot, then completions in the timed window */
+};
+
+/* Per-worker parameters filled in by mem_copy_benchmark() before launch. */
+struct lcore_params {
+	uint8_t scenario_id;
+	unsigned int lcore_id;
+	char *dma_name;		/* only set for DMA tests */
+	uint16_t worker_id;	/* index of this entry in worker_params[] */
+	uint16_t dev_id;	/* only set for DMA tests */
+	uint32_t nr_buf;	/* number of buffers in this worker's slice */
+	uint16_t kick_batch;	/* only set for DMA tests */
+	uint32_t buf_size;
+	uint16_t test_secs;
+	struct rte_mbuf **srcs;	/* start of this worker's slice of the source array */
+	struct rte_mbuf **dsts;	/* start of this worker's slice of the destination array */
+	struct worker_info worker_info;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write "In <func>:<lineno> - " followed by the formatted message to
+ * stderr. Returns the sum of the fprintf() and vfprintf() return values.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int prefix_len;
+	int body_len;
+
+	prefix_len = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	body_len = vfprintf(stderr, format, args);
+	va_end(args);
+
+	return prefix_len + body_len;
+}
+
+/*
+ * Derive the per-worker statistics from the number of copies completed in
+ * the timed window.
+ *
+ * buf_size:   bytes copied per operation
+ * nr_buf:     total number of buffers in the test
+ * nb_workers: number of workers sharing nr_buf
+ * test_secs:  length of the timed window in seconds
+ * total_cnt:  copies completed by this worker in the window
+ * memory:     out, per-worker footprint (source + destination) in MB
+ * ave_cycle:  out, average timer cycles per copy
+ * bandwidth:  out, Gbps
+ * mops:       out, million operations per second
+ *
+ * Every output is set to 0 when it cannot be computed (no workers, or no
+ * completions in the window) instead of dividing by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, uint32_t *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	const uint64_t hz = rte_get_timer_hz();
+
+	*memory = 0;
+	*ave_cycle = 0;
+	*bandwidth = 0;
+	*mops = 0;
+
+	/* Widen before multiplying: buf_size * nr_buf * 2 can exceed 32 bits. */
+	if (nb_workers != 0)
+		*memory = (uint32_t)(((uint64_t)buf_size * (nr_buf / nb_workers) * 2) /
+				(1024 * 1024));
+
+	if (total_cnt == 0)
+		return;
+
+	*ave_cycle = (uint32_t)(test_secs * hz / total_cnt);
+	if (*ave_cycle == 0)
+		return;
+
+	*bandwidth = ((float)buf_size * 8 * (hz / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)hz / *ave_cycle / 1000000;
+}
+
+/*
+ * Print one worker's result to stdout and store the matching CSV line in
+ * output_str[lcore_id] for main.c to write to the result file.
+ *
+ * output_str has MAX_WORKER_NB rows but is indexed by lcore id, so the CSV
+ * line is dropped (with a message) for lcore ids outside the table rather
+ * than written past the end of the array.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %uMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (lcore_id >= MAX_WORKER_NB) {
+		printf("lcore %u exceeds the result table size (%d), CSV line skipped.\n",
+			lcore_id, MAX_WORKER_NB);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of every mbuf's data area from the CPU
+ * cache, in 64-byte steps. Compiles to a no-op unless RTE_ARCH_X86_64 is
+ * defined.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure DMA device dev_id with a single mem-to-mem virtual channel of
+ * ring_size descriptors and start it. Any failure terminates the
+ * application through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	/* info is only meaningful when the query succeeded. */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve every DMA device name in cfg->lcore_dma_map to a device id, store
+ * the id in dma_ids[] and configure/start the device.
+ *
+ * Returns 0 when a device was set up for every worker, -1 otherwise.
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		/* The lookup reports failure as a negative errno value, not only -1. */
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+/*
+ * Worker loop for the DMA copy test.
+ *
+ * p points to a uint16_t holding this worker's index into worker_params[].
+ * The worker signals readiness, spins until start_flag is set, then keeps
+ * enqueuing copies of its srcs[] to its dsts[] on vchan 0 of dev_id until
+ * stop_flag is seen. Completions are accumulated in worker_info->total_cpl.
+ * Always returns 0; an unrecoverable enqueue failure ends the application
+ * through rte_exit().
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	int64_t async_cnt = 0;	/* copies enqueued but not yet seen as completed */
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait until the launcher releases all workers. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				/* Enqueue failed: kick the device and poll until the ring has room again. */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				/* Retry once; a second failure is treated as fatal. */
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/* NOTE(review): a kick_batch of 0 would divide by zero here - confirm the caller rejects it. */
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		/* stop_flag is only sampled after a full pass over the buffers. */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/* Submit what is still queued and drain it, giving up after POLL_MAX polls. */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker loop for the CPU copy test.
+ *
+ * p points to a uint16_t holding this worker's index into worker_params[].
+ * The worker signals readiness, spins until start_flag is set, then copies
+ * buf_size bytes from each of its srcs[] to the matching dsts[] with
+ * rte_memcpy() until stop_flag is seen, counting every copy in
+ * worker_info->total_cpl. Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait until the launcher releases all workers. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * The CPU must use virtual addresses: an IOVA is only
+			 * usable as a pointer when the IOVA mode happens to be VA.
+			 */
+			const void *src = rte_pktmbuf_mtod(srcs[i], void *);
+			void *dst = rte_pktmbuf_mtod(dsts[i], void *);
+
+			/* copy buffer from src to dst */
+			rte_memcpy(dst, src, (size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		/* stop_flag is only sampled after a full pass over the buffers. */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA
+ * nodes and allocate cfg->nr_buf mbufs from each.
+ *
+ * On success returns 0 and stores two rte_malloc'ed arrays of nr_buf mbuf
+ * pointers in *srcs and *dsts. On failure returns -1 with *srcs and *dsts
+ * set to NULL, so the caller never bulk-frees a half-initialised array.
+ * The pools themselves are left in src_pool/dst_pool for the caller to
+ * release.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	*srcs = NULL;
+	*dsts = NULL;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	/* The pool's data room size is a uint16_t; larger values would be silently truncated. */
+	if (buf_size > UINT16_MAX - RTE_PKTMBUF_HEADROOM) {
+		PRINT_ERR("Error: buf_size %u does not fit in a single mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		goto free_srcs_array;
+	}
+
+	/* Use the mbuf allocator so the buffers pair with rte_pktmbuf_free_bulk() at teardown. */
+	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		goto free_arrays;
+	}
+	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		goto free_src_mbufs;
+	}
+
+	return 0;
+
+free_src_mbufs:
+	rte_pktmbuf_free_bulk(*srcs, nr_buf);
+free_arrays:
+	rte_free(*dsts);
+	*dsts = NULL;
+free_srcs_array:
+	rte_free(*srcs);
+	*srcs = NULL;
+	return -1;
+}
+
+/*
+ * Run one scenario of the memory copy test described by cfg.
+ *
+ * Sets cfg->nr_buf from mem_size/buf_size, allocates the buffers, launches
+ * one worker per entry of cfg->lcore_dma_map (DMA workers when is_dma is
+ * true, CPU workers otherwise), measures the completions of each worker
+ * over cfg->test_secs seconds and reports them through output_result().
+ * All resources acquired here are released before returning.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint16_t nb_launched = 0;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint64_t total_bufs;
+	uint32_t nr_buf;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	uint32_t memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+	bool dma_configured = false;
+	int (*worker_fn)(void *) = is_dma ? do_dma_mem_copy : do_cpu_mem_copy;
+
+	/* These values are used as divisors or array bounds below. */
+	if (buf_size == 0 || nb_workers == 0 || nb_workers > MAX_WORKER_NB ||
+			(is_dma && kick_batch == 0)) {
+		printf("Error: invalid buf_size (%u), worker count (%u) or kick_batch (%u).\n",
+			buf_size, nb_workers, kick_batch);
+		return;
+	}
+
+	/* Compute in 64 bits: mem_size (MB) * 1024 * 1024 overflows 32 bits from 4096 MB on. */
+	total_bufs = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (total_bufs < nb_workers || total_bufs > UINT32_MAX) {
+		printf("Error: mem_size %u MB and buf_size %u give an unusable buffer count.\n",
+			cfg->mem_size.cur, buf_size);
+		return;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)total_bufs;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma) {
+		/* Even a partial failure may have started some devices. */
+		dma_configured = true;
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+	}
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+
+		/* Zeroed so the handshake flags and counters start from a known state. */
+		worker_params[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %u\n", lcore_id);
+			break;
+		}
+		if (is_dma) {
+			worker_params[i]->dma_name = ldm->dma_names[i];
+			worker_params[i]->dev_id = ldm->dma_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		/*
+		 * Hand the worker its own stable copy of the index. Passing &i
+		 * would race with this loop, which keeps modifying i while the
+		 * worker is starting up.
+		 */
+		if (rte_eal_remote_launch(worker_fn,
+				(void *)(uintptr_t)&worker_params[i]->worker_id,
+				lcore_id) != 0) {
+			printf("Failed to launch worker on lcore %u\n", lcore_id);
+			rte_free((void *)(uintptr_t)worker_params[i]);
+			worker_params[i] = NULL;
+			break;
+		}
+		nb_launched++;
+	}
+
+	/*
+	 * Launched workers spin on start_flag, so they are always released and
+	 * stopped below, even when not every worker could be launched.
+	 */
+	for (i = 0; i < nb_launched; i++)
+		while (!worker_params[i]->worker_info.ready_flag)
+			;
+
+	for (i = 0; i < nb_launched; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	if (nb_launched == nb_workers) {
+		/* Let the workers warm up, then measure over test_secs seconds. */
+		usleep(TEST_WAIT_U_SECOND);
+		for (i = 0; i < nb_workers; i++)
+			worker_params[i]->worker_info.test_cpl =
+				worker_params[i]->worker_info.total_cpl;
+
+		/* sleep() rather than usleep(): usleep() is only specified for values below one second. */
+		sleep(test_secs);
+		for (i = 0; i < nb_workers; i++)
+			worker_params[i]->worker_info.test_cpl =
+				worker_params[i]->worker_info.total_cpl -
+				worker_params[i]->worker_info.test_cpl;
+	}
+
+	for (i = 0; i < nb_launched; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	if (nb_launched != nb_workers)
+		goto out;
+
+	for (i = 0; i < nb_workers; i++) {
+		if (worker_params[i]->worker_info.test_cpl == 0) {
+			printf("lcore %u completed no copy in the test window, result skipped.\n",
+				worker_params[i]->lcore_id);
+			continue;
+		}
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	for (i = 0; i < nb_launched; i++) {
+		rte_free((void *)(uintptr_t)worker_params[i]);
+		worker_params[i] = NULL;
+	}
+
+	if (srcs) {
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_free(srcs);
+	}
+	if (dsts) {
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+		rte_free(dsts);
+	}
+
+	/* Reset the globals so a later scenario cannot free a stale pool again. */
+	rte_mempool_free(src_pool);
+	src_pool = NULL;
+	rte_mempool_free(dst_pool);
+	dst_pool = NULL;
+
+	if (dma_configured) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..0318da305b
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,59 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It should be greater than 64 normally.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first[,last,increment[,ADD|MUL]]. ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls how long, in seconds, each scenario of the case is measured.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..b55b09d090
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,567 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+/* Values of test_configure.test_type; TEST_TYPE_NONE marks a case whose type was never set. */
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every non-empty row of output_str to the result file and clear
+ * it afterwards. When need_blankline is set, two empty CSV rows are
+ * written first as a separator.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		char *line = output_str[row];
+
+		if (line[0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", line);
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/* Write the "test environment" block (title row and timer frequency) to the result file. */
+static void
+output_env_info(void)
+{
+	char *title_row = output_str[0];
+	char *freq_row = output_str[1];
+
+	snprintf(title_row, MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(freq_row, MAX_OUTPUT_STR_LEN, "CPU frequency,%" PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+/* Write the CSV column header of test case case_id to the result file. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	const char *type_name = case_cfg->test_type_str;
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, CSV_HDR_FMT, case_id, type_name);
+
+	output_csv(true);
+}
+
+/* Run one scenario with the benchmark that matches the case's test type. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY)
+		mem_copy_benchmark(case_cfg, true);
+	else if (type == TEST_TYPE_CPU_MEM_COPY)
+		mem_copy_benchmark(case_cfg, false);
+	else
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of test case case_id.
+ *
+ * The parameter with a non-zero increment (if any) is swept from first to
+ * last, applying its ADD/MUL step after each scenario; without such a
+ * parameter exactly one scenario is run. The sweep also ends when a step
+ * would not increase the value or would overflow 32 bits, so a
+ * configuration such as "1,100,1,MUL" cannot loop forever.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* One lcore stays with the launcher, so the workers need at least one more. */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		uint64_t next;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else
+			break;
+
+		/* No progress or 32-bit overflow: the sweep cannot continue meaningfully. */
+		if (next <= var_entry->cur || next > UINT32_MAX)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case: a list of decimal lcore ids
+ * separated by commas and/or spaces.
+ *
+ * The ids are stored in test_case->lcore_dma_map.lcores[] and counted in
+ * cnt; the map is cleared first. Returns 0 on success, -1 on a NULL
+ * argument, an allocation failure, a token that is not a valid lcore id,
+ * or more ids than the map can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *token, *end;
+	char *saveptr = NULL;
+	unsigned long lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	lcore_dma_map = &(test_case->lcore_dma_map);
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	/* strtok_r() modifies its input, so work on a private copy. */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	for (token = strtok_r(input, ", ", &saveptr); token != NULL;
+			token = strtok_r(NULL, ", ", &saveptr)) {
+		/* Bound by the real array size, not by the larger MAX_LCORE_NB. */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcores, at most %d are supported\n",
+				MAX_WORKER_NB);
+			ret = -1;
+			break;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0' || lcore_id >= RTE_MAX_LCORE) {
+			fprintf(stderr, "Illegal lcore %s\n", token);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint32_t)lcore_id;
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case: a comma separated list
+ * of "lcore<ID>@<DMA device name>" pairs.
+ *
+ * Each pair fills one slot of test_case->lcore_dma_map (lcores[] and
+ * dma_names[]); the map is cleared first. Returns 0 on success, -1 on a
+ * NULL argument, an allocation failure, an empty or malformed entry, a
+ * device name that does not fit, or more pairs than the map can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	char *saveptr = NULL;
+	long lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	/* Skip leading blanks; never step over the terminating NUL. */
+	addrs = input;
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok_r(addrs, ",", &saveptr);
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	do {
+		if (lcore_dma_map->cnt >= MAX_WORKER_NB) {
+			fprintf(stderr, "Too many lcore/DMA pairs, at most %d are supported\n",
+				MAX_WORKER_NB);
+			ret = -1;
+			break;
+		}
+
+		/* Both halves must be present, otherwise ptrs[1] is not set. */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal lcore/DMA pair\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start || lcore_id < 0 || lcore_id >= RTE_MAX_LCORE) {
+			fprintf(stderr, "No input lcore ID or ID %ld is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		if (strlen(ptrs[1]) >= RTE_DEV_NAME_MAX_LEN) {
+			fprintf(stderr, "DMA name %s is too long\n", ptrs[1]);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = (uint32_t)lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+			RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok_r(NULL, ",", &saveptr);
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Convert one decimal token (blanks around it allowed) to uint32_t.
+ * Returns 0 on success, -1 when the token is missing, negative, not a
+ * number, followed by other characters or too large.
+ */
+static int
+entry_token_to_u32(const char *token, uint32_t *out)
+{
+	unsigned long val;
+	char *end;
+
+	if (token == NULL)
+		return -1;
+
+	while (*token == ' ' || *token == '\t')
+		token++;
+	/* strtoul() would accept a sign and wrap the value around. */
+	if (*token == '-')
+		return -1;
+
+	val = strtoul(token, &end, 10);
+	if (end == token || val > UINT32_MAX)
+		return -1;
+
+	while (*end == ' ' || *end == '\t')
+		end++;
+	if (*end != '\0')
+		return -1;
+
+	*out = (uint32_t)val;
+	return 0;
+}
+
+/*
+ * Parse a parameter written as "first[,last,increment[,ADD|MUL]]" into
+ * entry.
+ *
+ * Missing fields are set to 0. When an increment is given without an
+ * operation, ADD is used, as documented in config.ini; without an
+ * increment the operation is OP_NONE.
+ *
+ * Returns the number of comma separated fields found (1 to 4), or -1 on
+ * a NULL/empty/over-long value, a non-numeric field or an unknown
+ * operation.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	uint32_t first = 0, last = 0, incr = 0;
+	int args_nr;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	if (*value == '\0' || strlen(value) >= sizeof(input))
+		return -1;
+	strlcpy(input, value, sizeof(input));
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		return -1;
+
+	if (entry_token_to_u32(args[0], &first) < 0 ||
+			(args_nr > 1 && entry_token_to_u32(args[1], &last) < 0) ||
+			(args_nr > 2 && entry_token_to_u32(args[2], &incr) < 0)) {
+		printf("Invalid number in %s\n", value);
+		return -1;
+	}
+
+	entry->cur = entry->first = first;
+	entry->last = last;
+	entry->incr = incr;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s\n", args[3]);
+			return -1;
+		}
+	} else if (args_nr > 2)
+		entry->op = OP_ADD;
+	else
+		entry->op = OP_NONE;
+
+	return args_nr;
+}
+
+/*
+ * Read a mandatory unsigned integer entry that must not exceed max.
+ * Returns 0 and stores the value in *val, or -1 (with a message) when the
+ * key is missing, not a number or out of range.
+ */
+static int
+cfg_get_uint(struct rte_cfgfile *cfgfile, const char *section, const char *key,
+		unsigned long max, unsigned long *val)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, key);
+	char *end;
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", key, section);
+		return -1;
+	}
+
+	*val = strtoul(str, &end, 10);
+	if (end == str || *str == '-' || *val > max) {
+		printf("Error: Invalid %s (%s) in %s.\n", key, str, section);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill test_case from the section section_name of cfgfile. case_id is only
+ * used in messages. Returns 0 when the case is usable, -1 otherwise.
+ */
+static int
+load_one_case(struct rte_cfgfile *cfgfile, const char *section_name, int case_id,
+		struct test_configure *test_case)
+{
+	const char *case_type, *lcore_str, *eal_args;
+	unsigned long val;
+	int args_nr, nb_vp = 0;
+	bool is_dma;
+
+	case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+	if (!case_type) {
+		printf("Error: No case type in case %d, the test will be finished here.\n",
+			case_id);
+		return -1;
+	}
+
+	if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+		test_case->test_type_str = DMA_MEM_COPY;
+		is_dma = true;
+	} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+		test_case->test_type_str = CPU_MEM_COPY;
+		is_dma = false;
+	} else {
+		printf("Error: Cannot find case type %s in case%d.\n", case_type, case_id);
+		return -1;
+	}
+
+	if (cfg_get_uint(cfgfile, section_name, "src_numa_node", UINT16_MAX, &val) < 0)
+		return -1;
+	test_case->src_numa_node = (uint16_t)val;
+
+	if (cfg_get_uint(cfgfile, section_name, "dst_numa_node", UINT16_MAX, &val) < 0)
+		return -1;
+	test_case->dst_numa_node = (uint16_t)val;
+
+	args_nr = parse_entry(rte_cfgfile_get_entry(cfgfile, section_name, "mem_size"),
+				&test_case->mem_size);
+	if (args_nr < 0) {
+		printf("parse error in case %d.\n", case_id);
+		return -1;
+	} else if (args_nr > 1)
+		nb_vp++;
+
+	args_nr = parse_entry(rte_cfgfile_get_entry(cfgfile, section_name, "buf_size"),
+				&test_case->buf_size);
+	if (args_nr < 0) {
+		printf("parse error in case %d.\n", case_id);
+		return -1;
+	} else if (args_nr > 1)
+		nb_vp++;
+
+	if (is_dma) {
+		args_nr = parse_entry(rte_cfgfile_get_entry(cfgfile, section_name,
+					"dma_ring_size"), &test_case->ring_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", case_id);
+			return -1;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		args_nr = parse_entry(rte_cfgfile_get_entry(cfgfile, section_name,
+					"kick_batch"), &test_case->kick_batch);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", case_id);
+			return -1;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+		if (lcore_str == NULL || parse_lcore_dma(test_case, lcore_str) < 0) {
+			printf("parse lcore dma error in case %d.\n", case_id);
+			return -1;
+		}
+	} else {
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+		if (lcore_str == NULL || parse_lcore(test_case, lcore_str) < 0) {
+			printf("parse lcore error in case %d.\n", case_id);
+			return -1;
+		}
+	}
+
+	if (nb_vp > 1) {
+		printf("Error, each section can only have a single variable parameter.\n");
+		return -1;
+	}
+
+	if (cfg_get_uint(cfgfile, section_name, "cache_flush", UINT32_MAX, &val) < 0)
+		return -1;
+	test_case->cache_flush = (uint32_t)val;
+
+	if (cfg_get_uint(cfgfile, section_name, "test_seconds", UINT16_MAX, &val) < 0)
+		return -1;
+	if (val == 0) {
+		printf("Error: test_seconds must be greater than 0 in case %d.\n", case_id);
+		return -1;
+	}
+	test_case->test_secs = (uint16_t)val;
+
+	/*
+	 * The string returned by rte_cfgfile_get_entry() belongs to cfgfile and
+	 * is gone once the file is closed, so keep a private copy. The copy
+	 * lives as long as test_cases[], i.e. until the process exits.
+	 */
+	eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	test_case->eal_args = NULL;
+	if (eal_args != NULL) {
+		test_case->eal_args = strdup(eal_args);
+		if (test_case->eal_args == NULL) {
+			printf("Error: Cannot store eal_args of case %d.\n", case_id);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Load the configuration file at path into test_cases[].
+ *
+ * Sections are expected to be named "case1", "case2", ...; each one is
+ * parsed into the matching test_cases[] slot and flagged through is_valid.
+ * Exits the process when the file cannot be opened or holds more than
+ * MAX_TEST_CASES sections. Returns the number of sections processed.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	char section_name[CFG_NAME_LEN];
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		rte_cfgfile_close(cfgfile);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_cases[i].is_valid =
+			load_one_case(cfgfile, section_name, i + 1, &test_cases[i]) == 0;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector handed to rte_eal_init().
+ *
+ * Copies argv into new_argv, leaving out "--config"/"--result" and their
+ * values, then appends the space separated tokens of eal_args (may be
+ * NULL). new_argv must provide MAX_EAL_PARAM_NB buffers of
+ * MAX_EAL_PARAM_LEN bytes each; arguments beyond that count are dropped
+ * with a message. Returns the number of arguments stored.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB) {
+			printf("Too many arguments, the rest is ignored.\n");
+			return new_argc;
+		}
+		/* new_argv[n] is a pointer here, so sizeof() would give 8, not the buffer size. */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			/* Consecutive spaces produce empty tokens; do not pass them on. */
+			if (tokens[i][0] == '\0')
+				continue;
+			if (new_argc >= MAX_EAL_PARAM_NB) {
+				printf("Too many arguments, the rest is ignored.\n");
+				break;
+			}
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point: load the test cases from the file given with "--config"
+ * and run each valid case in its own forked process, so that every case
+ * initialises the EAL with its own arguments. Results are appended to the
+ * file given with "--result", or to "<config name>_result.csv" next to
+ * the configuration file when no result file is given.
+ *
+ * Returns 0 after all cases were attempted, -1 when the configuration or
+ * result file cannot be determined or opened.
+ */
+int
+main(int argc, char *argv[])
+{
+	static const char result_suffix[] = "_result.csv";
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last simply stays unset. */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		char *base, *dot;
+		size_t len;
+
+		if (strlen(cfg_path_ptr) >= sizeof(rst_path)) {
+			printf("Config file path is too long.\n");
+			return -1;
+		}
+		strlcpy(rst_path, cfg_path_ptr, sizeof(rst_path));
+
+		/* Cut the file name at its first '.' that is not a leading one. */
+		base = strrchr(rst_path, '/');
+		base = (base != NULL) ? base + 1 : rst_path;
+		dot = strchr(base + strspn(base, "."), '.');
+		if (dot != NULL)
+			*dot = '\0';
+
+		len = strlen(rst_path);
+		if (len + sizeof(result_suffix) > sizeof(rst_path)) {
+			printf("Result file path is too long.\n");
+			return -1;
+		}
+		memcpy(rst_path + len, result_suffix, sizeof(result_suffix));
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+
+	/* Create or truncate the result file; each child appends to it. */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %u.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %u.\n\n", i + 1);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %u failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd)
+				rte_exit(EXIT_FAILURE, "Open output CSV file error.\n");
+
+			if (is_first_case) {
+				output_env_info();
+				is_first_case = false;
+			}
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus)) {
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+				/*
+				 * The child's copy of is_first_case dies with it, so the
+				 * parent records that the environment block was written;
+				 * otherwise every case would emit it again.
+				 */
+				if (WEXITSTATUS(wstatus) == EXIT_SUCCESS)
+					is_first_case = false;
+			} else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+typedef enum {	/* how a swept parameter advances from one scenario to the next */
+	OP_NONE = 0,	/* no operator given: the parameter is not swept */
+	OP_ADD,	/* add the increment to the current value */
+	OP_MUL	/* multiply the current value by the increment */
+} alg_op_type;
+
+struct test_configure_entry {	/* one numeric parameter, optionally swept over a range */
+	uint32_t first;	/* first (or only) value */
+	uint32_t last;	/* last value of the sweep; 0 when not given */
+	uint32_t incr;	/* increment or factor; 0 when not given */
+	alg_op_type op;	/* how incr is applied to cur */
+	uint32_t cur;	/* value used by the scenario currently running */
+};
+
+struct lcore_dma_map_t {	/* per-worker lcore / DMA device assignment */
+	uint32_t lcores[MAX_WORKER_NB];	/* lcore id of each worker */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];	/* DMA device name of each worker */
+	int16_t dma_ids[MAX_WORKER_NB];	/* dmadev id of each worker */
+	uint16_t cnt;	/* number of valid entries in the arrays above */
+};
+
+struct test_configure {	/* one test case as described by a config file section */
+	bool is_valid;	/* false when the section could not be parsed */
+	uint8_t test_type;	/* TEST_TYPE_* value defined in main.c */
+	const char *test_type_str;	/* printable name of the test type */
+	uint16_t src_numa_node;	/* NUMA node for the source buffers */
+	uint16_t dst_numa_node;	/* NUMA node for the destination buffers */
+	uint16_t opcode;	/* NOTE(review): no user visible in this patch */
+	bool is_dma;	/* NOTE(review): no user visible in this patch */
+	struct lcore_dma_map_t lcore_dma_map;	/* workers and their DMA devices */
+	struct test_configure_entry mem_size;	/* memory footprint in MB */
+	struct test_configure_entry buf_size;	/* bytes copied per operation */
+	struct test_configure_entry ring_size;	/* DMA ring size (descriptors) */
+	struct test_configure_entry kick_batch;	/* DMA operations per submit */
+	uint32_t cache_flush;	/* non-zero: flush buffers from cache before the test */
+	uint32_t nr_buf;	/* number of buffers in each direction */
+	uint16_t test_secs;	/* measurement time in seconds */
+	const char *eal_args;	/* extra EAL arguments for this case */
+	uint8_t scenario_id;	/* 1-based index of the running scenario */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']  # DPDK libraries used by main.c and benchmark.c
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.35.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v4] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
  2023-05-17  6:16 ` [PATCH v2] " Cheng Jiang
  2023-05-17  7:31 ` [PATCH v3] " Cheng Jiang
@ 2023-06-08  5:03 ` Cheng Jiang
  2023-06-08  8:27   ` Xia, Chenbo
  2023-06-08  8:43 ` [PATCH v5] " Cheng Jiang
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 53+ messages in thread
From: Cheng Jiang @ 2023-06-08  5:03 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 471 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  59 ++++
 app/test-dma-perf/main.c      | 567 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 +++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1184 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..fd5be393ad
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+struct worker_info {	/* handshake flags and counters shared by the main lcore and one worker */
+	bool ready_flag;	/* set by the worker once it has entered its copy routine */
+	bool start_flag;	/* set by the main lcore to release the worker */
+	bool stop_flag;	/* set by the main lcore to end the worker's copy loop */
+	uint32_t total_cpl;	/* running count of completed copies, updated by the worker */
+	uint32_t test_cpl;	/* completions counted inside the measurement window */
+};
+
+struct lcore_params {	/* everything one worker needs to run its share of the benchmark */
+	uint8_t scenario_id;	/* scenario this worker belongs to */
+	unsigned int lcore_id;	/* lcore the worker runs on */
+	char *dma_name;	/* name of the worker's DMA device (DMA tests only) */
+	uint16_t worker_id;	/* index of this worker in worker_params[] */
+	uint16_t dev_id;	/* dmadev id (DMA tests only) */
+	uint32_t nr_buf;	/* number of buffers handled by this worker */
+	uint16_t kick_batch;	/* DMA operations per submit (DMA tests only) */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint16_t test_secs;	/* measurement time in seconds */
+	struct rte_mbuf **srcs;	/* this worker's slice of the source mbufs */
+	struct rte_mbuf **dsts;	/* this worker's slice of the destination mbufs */
+	struct worker_info worker_info;	/* flags and counters shared with the main lcore */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Print "In <func>:<line> - " followed by the formatted message to stderr.
+ * Returns the sum of the fprintf() and vfprintf() return values.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int written = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	written += vfprintf(stderr, format, args);
+	va_end(args);
+
+	return written;
+}
+
+/*
+ * Derive the per-worker statistics of one scenario.
+ *
+ * memory    - footprint of one worker in MB (source plus destination buffers)
+ * ave_cycle - timer cycles per completed copy
+ * bandwidth - Gbps moved by the worker
+ * mops      - million copies per second
+ *
+ * The footprint is computed in 64 bits so large buffer sizes cannot wrap.
+ * A worker that completed nothing (total_cnt == 0) reports zeros instead of
+ * dividing by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	const uint64_t bufs_per_worker = nb_workers != 0 ? nr_buf / nb_workers : 0;
+
+	*memory = (float)((uint64_t)buf_size * bufs_per_worker * 2) / (1024 * 1024);
+
+	if (total_cnt == 0) {
+		*ave_cycle = 0;
+		*bandwidth = 0;
+		*mops = 0;
+		return;
+	}
+
+	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
+	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000;
+}
+
+/*
+ * Print the result of one worker on the console and store the matching CSV
+ * line in output_str[lcore_id].
+ *
+ * output_str has MAX_WORKER_NB rows, so an lcore id beyond that is only
+ * printed; writing its row would overflow the table.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, float memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %.2lfMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (lcore_id >= MAX_WORKER_NB) {
+		fprintf(stderr, "lcore %u is beyond the %d CSV rows, result not recorded.\n",
+			lcore_id, MAX_WORKER_NB);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of every mbuf's data area from the CPU
+ * cache, one clflush per 64 bytes. Compiles to nothing outside x86-64.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a DMA device with a single mem-to-mem virtual channel of
+ * ring_size descriptors and start it. Any failure terminates the process
+ * through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	/* info is only meaningful when the query itself succeeded. */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve the DMA device name of every worker to a dmadev id, then configure
+ * and start that device.
+ *
+ * Returns 0 when every worker got a device, -1 when a name could not be
+ * resolved (devices configured before the failure stay started).
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		/* Failure is reported as a negative errno value, not only -1. */
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			break;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+/*
+ * Worker routine of the DMA copy test.
+ *
+ * p points to a uint16_t index into worker_params[]. The worker signals
+ * ready_flag, spins until start_flag is set, then keeps enqueuing its buffers
+ * on vchan 0 of its device until stop_flag is seen at the end of a pass.
+ * Completions are accumulated in worker_info->total_cpl. After the loop the
+ * outstanding copies are drained with at most POLL_MAX polls.
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	/* kick_batch is a modulus below; 0 would trap, so submit after every copy. */
+	if (kick_batch == 0)
+		kick_batch = 1;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				/* Enqueue failed: kick the device and reap until there is room. */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/* Drain what is still in flight; these completions are not counted. */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker routine of the CPU copy test.
+ *
+ * p points to a uint16_t index into worker_params[]. The worker signals
+ * ready_flag, spins until start_flag is set, then copies every source buffer
+ * to its destination with rte_memcpy() until stop_flag is seen at the end of
+ * a pass. Each copy increments worker_info->total_cpl.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * Copy buffer from src to dst. The CPU must use the
+			 * virtual address of the data area; the IOVA is only
+			 * meaningful to devices.
+			 */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA nodes
+ * and take cfg->nr_buf mbufs from each of them.
+ *
+ * On success *srcs and *dsts point to arrays of nr_buf mbuf pointers.
+ * Returns 0 on success, -1 on failure. Nothing is released here on failure;
+ * the pools (file-scope src_pool/dst_pool) and the arrays are left for the
+ * caller to clean up. The arrays are zero-filled at allocation so that slots
+ * never filled stay NULL.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
+		return -1;
+	}
+
+	/* The pool's data room size is 16 bits wide; refuse sizes that would be truncated. */
+	if (buf_size > UINT16_MAX - RTE_PKTMBUF_HEADROOM) {
+		printf("Error: buffer size %u is too large for one mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one scenario of the memory copy benchmark and report the result of
+ * every worker.
+ *
+ * cfg    - test case; mem_size.cur, buf_size.cur, ring_size.cur and
+ *          kick_batch.cur select the scenario. cfg->nr_buf is updated.
+ * is_dma - true to copy with DMA devices, false to copy with the CPU.
+ *
+ * Steps: allocate the buffers, configure the DMA devices, launch one worker
+ * per configured lcore, let them warm up for TEST_WAIT_U_SECOND, count
+ * completions for test_secs seconds, stop the workers and print the results.
+ * All resources taken here are released before returning.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint16_t launched;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	uint64_t nr_buf_wide;
+	uint32_t nr_buf;
+	bool mbufs_ready = false;
+	float memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	if (buf_size == 0 || nb_workers == 0 || nb_workers > MAX_WORKER_NB) {
+		printf("Error: invalid buffer size (%u) or worker number (%u).\n",
+			buf_size, nb_workers);
+		return;
+	}
+
+	/* Half of the footprint is sources, half destinations; 64-bit math avoids wrapping. */
+	nr_buf_wide = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (nr_buf_wide < nb_workers || nr_buf_wide > UINT32_MAX) {
+		printf("Error: memory size %uMB gives an unusable number of buffers.\n",
+			cfg->mem_size.cur);
+		return;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)nr_buf_wide;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+	mbufs_ready = true;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/*
+	 * Allocate every parameter block before launching anything, zeroed so
+	 * the handshake flags and counters start from a known state.
+	 */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		worker_params[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			while (i-- > 0) {
+				rte_free((void *)(uintptr_t)worker_params[i]);
+				worker_params[i] = NULL;
+			}
+			goto out;
+		}
+	}
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+
+		if (is_dma) {
+			worker_params[i]->dma_name = ldm->dma_names[i];
+			worker_params[i]->dev_id = ldm->dma_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		/*
+		 * The worker reads its index through the pointer it is given.
+		 * Hand it the copy stored in its own parameter block; the loop
+		 * counter keeps changing while the worker starts up.
+		 */
+		if (!rte_lcore_is_enabled(lcore_id) ||
+			rte_eal_remote_launch(is_dma ? do_dma_mem_copy : do_cpu_mem_copy,
+				(void *)(uintptr_t)&worker_params[i]->worker_id,
+				lcore_id) != 0) {
+			printf("Error: cannot launch a worker on lcore %u.\n", lcore_id);
+			break;
+		}
+	}
+	launched = i;
+
+	for (i = 0; i < launched; i++)
+		while (worker_params[i]->worker_info.ready_flag == false)
+			;
+
+	if (launched < nb_workers) {
+		/* Release the workers already running with stop set so they exit after one pass. */
+		for (i = 0; i < launched; i++) {
+			worker_params[i]->worker_info.stop_flag = true;
+			worker_params[i]->worker_info.start_flag = true;
+		}
+		rte_eal_mp_wait_lcore();
+		goto free_workers;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	/* Warm-up: completions before this point are excluded from the result. */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl;
+
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl -
+						worker_params[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+free_workers:
+	for (i = 0; i < nb_workers; i++) {
+		rte_free((void *)(uintptr_t)worker_params[i]);
+		worker_params[i] = NULL;
+	}
+
+out:
+	/*
+	 * free env: the mbuf arrays are only known to be fully populated when
+	 * the setup succeeded; otherwise freeing the pools releases the mbufs.
+	 */
+	if (mbufs_ready) {
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+	}
+	rte_free(srcs);
+	rte_free(dsts);
+
+	/* Clear the pointers so a later scenario never frees a stale pool. */
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		dst_pool = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..0318da305b
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,59 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It should be greater than 64 normally.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first[,last,increment[,ADD|MUL]]. ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of the whole case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..b55b09d090
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,567 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every non-empty row of output_str to the result file and clear it.
+ * When need_blankline is set, two empty CSV rows are written first.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+
+	for (row = 0; need_blankline && row < 2; row++)
+		fprintf(fd, ",,,,,,,,\n");
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		if (output_str[row][0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", output_str[row]);
+		output_str[row][0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/* Write the test environment rows (timer frequency) to the result file. */
+static void
+output_env_info(void)
+{
+	const uint64_t timer_hz = rte_get_timer_hz();
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%" PRIu64 "\n", timer_hz);
+
+	output_csv(true);
+}
+
+/* Write the CSV column header of a test case, preceded by two empty rows. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *header_row = output_str[0];
+
+	snprintf(header_row, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT,
+			case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/* Run the benchmark that matches the test type of case_cfg. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY)
+		mem_copy_benchmark(case_cfg, true);
+	else if (type == TEST_TYPE_CPU_MEM_COPY)
+		mem_copy_benchmark(case_cfg, false);
+	else
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * The entry with a non-zero increment (checked in the order mem_size,
+ * buf_size, ring_size, kick_batch; the last match wins) is swept from first
+ * to last. Without such an entry a single scenario is run.
+ * The sweep also stops when a step does not move the value forward (for
+ * example MUL by 0 or 1, or a 32-bit wrap), which would otherwise never end.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* One lcore is needed on top of the workers. */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		const uint32_t prev = var_entry->cur;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			var_entry->cur *= var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			var_entry->cur += var_entry->incr;
+		else
+			break;
+
+		/* No forward progress: stop rather than loop forever. */
+		if (var_entry->cur <= prev)
+			break;
+	}
+}
+
+/*
+ * Parse a list of lcore ids separated by commas and/or spaces ("3, 4") into
+ * test_case->lcore_dma_map.
+ *
+ * Returns 0 on success, -1 on a NULL argument, an allocation failure, a
+ * token that is not a number, or more ids than the map can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *token;
+	char *end;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &(test_case->lcore_dma_map);
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		unsigned long lcore_id;
+
+		/* Bound by the real array size, not by MAX_LCORE_NB. */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			ret = -1;
+			break;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0') {
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint16_t)lcore_id;
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse a comma separated list of "lcore<N>@<dma name>" pairs into
+ * test_case->lcore_dma_map.
+ *
+ * Returns 0 on success, -1 when the value is missing or empty, a pair is
+ * malformed, a DMA name does not fit, or there are more pairs than the map
+ * can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL) {
+		fprintf(stderr, "No input DMA addresses\n");
+		return -1;
+	}
+
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+	addrs = input;
+
+	/* Skip leading blanks; stop at the terminator. */
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	do {
+		/* Both halves of "lcoreN@name" are required. */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcore and DMA pairs\n");
+			ret = -1;
+			break;
+		}
+		if (strlen(ptrs[1]) >= sizeof(lcore_dma_map->dma_names[0])) {
+			fprintf(stderr, "DMA name %s is too long\n", ptrs[1]);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+			sizeof(lcore_dma_map->dma_names[0]));
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse "first[,last,increment[,ADD|MUL]]" into entry.
+ *
+ * Returns the number of comma separated fields found, or -1 when value or
+ * entry is NULL, the value is empty, or the operator is not ADD or MUL.
+ * At most 254 characters of value are considered.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char buf[255] = {0};
+	char *fields[MAX_PARAMS_PER_ENTRY];
+	int nr_fields;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	strncpy(buf, value, 254);
+	if (buf[0] == '\0')
+		return -1;
+
+	nr_fields = rte_strsplit(buf, strlen(buf), fields, MAX_PARAMS_PER_ENTRY, ',');
+	if (nr_fields <= 0)
+		return nr_fields;
+
+	entry->first = (uint32_t)atoi(fields[0]);
+	entry->cur = entry->first;
+
+	entry->last = 0;
+	if (nr_fields > 1)
+		entry->last = (uint32_t)atoi(fields[1]);
+
+	entry->incr = 0;
+	if (nr_fields > 2)
+		entry->incr = (uint32_t)atoi(fields[2]);
+
+	if (nr_fields <= 3) {
+		entry->op = OP_NONE;
+		return nr_fields;
+	}
+
+	if (strcmp(fields[3], "MUL") == 0) {
+		entry->op = OP_MUL;
+	} else if (strcmp(fields[3], "ADD") == 0) {
+		entry->op = OP_ADD;
+	} else {
+		printf("Invalid op %s", fields[3]);
+		return -1;
+	}
+
+	return nr_fields;
+}
+
+/*
+ * Parse one sweepable entry of a section into entry.
+ * Increments *nb_vp when the entry has more than one field.
+ * Returns 0 on success, -1 when the key is missing or malformed.
+ */
+static int
+load_var_entry(struct rte_cfgfile *cfgfile, const char *section, const char *key,
+		struct test_configure_entry *entry, int *nb_vp)
+{
+	int args_nr = parse_entry(rte_cfgfile_get_entry(cfgfile, section, key), entry);
+
+	if (args_nr < 0)
+		return -1;
+	if (args_nr > 1)
+		(*nb_vp)++;
+	return 0;
+}
+
+/* Read a mandatory integer entry into *val; returns -1 when the key is missing. */
+static int
+load_int_entry(struct rte_cfgfile *cfgfile, const char *section, const char *key, int *val)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, key);
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", key, section);
+		return -1;
+	}
+	*val = atoi(str);
+	return 0;
+}
+
+/* Fill test_case from one section; returns 0 when the case is usable, -1 otherwise. */
+static int
+load_one_case(struct rte_cfgfile *cfgfile, const char *section_name, int case_no,
+		struct test_configure *test_case)
+{
+	const char *case_type;
+	const char *lcore_str;
+	const char *eal_args;
+	int nb_vp = 0;
+	int val;
+	bool is_dma;
+
+	case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+	if (!case_type) {
+		printf("Error: No case type in case %d, the test will be finished here.\n", case_no);
+		return -1;
+	}
+
+	if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+		test_case->test_type_str = DMA_MEM_COPY;
+		is_dma = true;
+	} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+		test_case->test_type_str = CPU_MEM_COPY;
+		is_dma = false;
+	} else {
+		printf("Error: Cannot find case type %s in case%d.\n", case_type, case_no);
+		return -1;
+	}
+
+	if (load_int_entry(cfgfile, section_name, "src_numa_node", &val) < 0)
+		return -1;
+	test_case->src_numa_node = val;
+	if (load_int_entry(cfgfile, section_name, "dst_numa_node", &val) < 0)
+		return -1;
+	test_case->dst_numa_node = val;
+
+	if (load_var_entry(cfgfile, section_name, "mem_size", &test_case->mem_size, &nb_vp) < 0 ||
+		load_var_entry(cfgfile, section_name, "buf_size", &test_case->buf_size, &nb_vp) < 0) {
+		printf("parse error in case %d.\n", case_no);
+		return -1;
+	}
+
+	if (is_dma) {
+		if (load_var_entry(cfgfile, section_name, "dma_ring_size",
+				&test_case->ring_size, &nb_vp) < 0 ||
+			load_var_entry(cfgfile, section_name, "kick_batch",
+				&test_case->kick_batch, &nb_vp) < 0) {
+			printf("parse error in case %d.\n", case_no);
+			return -1;
+		}
+
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+		if (lcore_str == NULL || parse_lcore_dma(test_case, lcore_str) < 0) {
+			printf("parse lcore dma error in case %d.\n", case_no);
+			return -1;
+		}
+	} else {
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+		if (lcore_str == NULL || parse_lcore(test_case, lcore_str) < 0) {
+			printf("parse lcore error in case %d.\n", case_no);
+			return -1;
+		}
+	}
+
+	if (nb_vp > 1) {
+		printf("Error, each section can only have a single variable parameter.\n");
+		return -1;
+	}
+
+	if (load_int_entry(cfgfile, section_name, "cache_flush", &val) < 0)
+		return -1;
+	test_case->cache_flush = val;
+	if (load_int_entry(cfgfile, section_name, "test_seconds", &val) < 0)
+		return -1;
+	test_case->test_secs = (uint16_t)val;
+
+	/*
+	 * The entry string belongs to the cfgfile object, which is closed
+	 * before the case runs. Keep a private copy; it lives until the
+	 * process exits.
+	 */
+	test_case->eal_args = NULL;
+	eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	if (eal_args != NULL) {
+		test_case->eal_args = strdup(eal_args);
+		if (test_case->eal_args == NULL) {
+			printf("Error: out of memory in case %d.\n", case_no);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Load the test cases from the configuration file at path into test_cases[].
+ * Section i (0-based) must be named "case<i+1>".
+ *
+ * Returns the number of sections in the file. Cases that could not be
+ * parsed are kept with is_valid == false. Exits the process when the file
+ * cannot be loaded or holds more than MAX_TEST_CASES sections.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	char section_name[CFG_NAME_LEN];
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_cases[i].is_valid =
+			load_one_case(cfgfile, section_name, i + 1, &test_cases[i]) == 0;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector handed to rte_eal_init().
+ *
+ * Copies argv without the "--config <file>" and "--result <file>" pairs,
+ * then appends the space separated tokens of eal_args (may be NULL).
+ * Every new_argv[] slot must point to a buffer of MAX_EAL_PARAM_LEN bytes and
+ * there must be MAX_EAL_PARAM_NB slots, as set up by main().
+ *
+ * Returns the number of arguments stored in new_argv. Arguments beyond
+ * MAX_EAL_PARAM_NB are dropped with a message.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB)
+			goto too_many;
+		/* The bound is the slot's buffer size; sizeof on the slot is only a pointer size. */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			/* Consecutive spaces yield empty tokens; EAL must not see them. */
+			if (tokens[i][0] == '\0')
+				continue;
+			if (new_argc >= MAX_EAL_PARAM_NB)
+				goto too_many;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+
+too_many:
+	printf("Too many EAL arguments, only the first %d are used.\n", MAX_EAL_PARAM_NB);
+	return new_argc;
+}
+
+/*
+ * Entry point: load the test cases from the file given with --config and run
+ * every valid case in a forked child, so each case initialises EAL with its
+ * own arguments. Results are appended to the file given with --result, or to
+ * "<config name up to its first dot>_result.csv" next to the config file.
+ *
+ * Returns 0 when all cases were processed, -1 on a command line or result
+ * file error.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option without a value leaves its pointer NULL. */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		char *base, *dot;
+		size_t len;
+
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		/* Cut the file name at its first dot (leading dots are kept). */
+		base = strrchr(rst_path, '/');
+		base = (base != NULL) ? base + 1 : rst_path;
+		base += strspn(base, ".");
+		dot = strchr(base, '.');
+		if (dot != NULL)
+			*dot = '\0';
+
+		len = strlen(rst_path);
+		ret = snprintf(rst_path + len, sizeof(rst_path) - len, "_result.csv");
+		if (ret < 0 || (size_t)ret >= sizeof(rst_path) - len) {
+			printf("Result file path is too long.\n");
+			return -1;
+		}
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* Create or truncate the result file; the children append to it. */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %d.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				rte_eal_cleanup();
+				exit(EXIT_FAILURE);
+			}
+
+			if (is_first_case)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/*
+			 * The child only changes its own copy of the flag, so
+			 * the parent has to remember that the environment
+			 * header has been handed to a case.
+			 */
+			is_first_case = false;
+
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a swept configuration entry advances from one scenario to the next. */
+typedef enum {
+	OP_NONE = 0,	/* no stepping: the entry is run once with its first value */
+	OP_ADD,		/* next value = current value + increment */
+	OP_MUL		/* next value = current value * increment */
+} alg_op_type;
+
+/* One numeric test parameter, either fixed or swept over a range. */
+struct test_configure_entry {
+	uint32_t first;		/* starting value */
+	uint32_t last;		/* inclusive upper bound of the sweep (0 when not given) */
+	uint32_t incr;		/* step operand (0 when not given) */
+	alg_op_type op;		/* how incr is applied to cur */
+	uint32_t cur;		/* value used by the scenario currently running */
+};
+
+/* Worker table: entry i pairs a worker lcore with the DMA device it drives. */
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB];		/* lcore id of each worker */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];	/* DMA device name per worker (DMA tests) */
+	int16_t dma_ids[MAX_WORKER_NB];		/* dmadev id resolved from dma_names */
+	uint16_t cnt;				/* number of valid entries in the arrays above */
+};
+
+/* Complete description of one test case loaded from the config file. */
+struct test_configure {
+	bool is_valid;			/* cases with this flag cleared are skipped */
+	uint8_t test_type;		/* selects the benchmark to run */
+	const char *test_type_str;	/* test type name, printed in the CSV header */
+	uint16_t src_numa_node;		/* socket on which the source buffers are allocated */
+	uint16_t dst_numa_node;		/* socket on which the destination buffers are allocated */
+	uint16_t opcode;
+	bool is_dma;
+	struct lcore_dma_map_t lcore_dma_map;	/* workers and their DMA devices */
+	struct test_configure_entry mem_size;	/* memory footprint, in MB */
+	struct test_configure_entry buf_size;	/* bytes copied per operation */
+	struct test_configure_entry ring_size;	/* descriptor count requested for the DMA vchan */
+	struct test_configure_entry kick_batch;	/* enqueued copies between two DMA submits */
+	uint32_t cache_flush;		/* non-zero: flush the buffers from cache before the test */
+	uint32_t nr_buf;		/* buffer pair count, derived from mem_size and buf_size */
+	uint16_t test_secs;		/* measurement duration, in seconds */
+	const char *eal_args;		/* extra EAL arguments for this case */
+	uint8_t scenario_id;		/* 1-based index of the scenario being run */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+# dmadev: DMA device API used by the benchmark; mbuf: source/destination
+# buffers; cfgfile: parsing of the test case configuration file.
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v4] app/dma-perf: introduce dma-perf application
  2023-06-08  5:03 ` [PATCH v4] " Cheng Jiang
@ 2023-06-08  8:27   ` Xia, Chenbo
  2023-06-08  8:38     ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Xia, Chenbo @ 2023-06-08  8:27 UTC (permalink / raw)
  To: Jiang, Cheng1, thomas, Richardson, Bruce, mb
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Thursday, June 8, 2023 1:03 PM
> To: thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>; Jiang,
> Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v4] app/dma-perf: introduce dma-perf application
> 
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>

Some LONG_LINE warning could be fixed if it's friendly for searching
logs.

I didn't look into all details but overall it seems good.

Acked-by: Chenbo Xia <chenbo.xia@intel.com>

> ---
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 471 ++++++++++++++++++++++++++++
>  app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 567 ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1184 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c
>  create mode 100644 app/test-dma-perf/config.ini
>  create mode 100644 app/test-dma-perf/main.c
>  create mode 100644 app/test-dma-perf/main.h
>  create mode 100644 app/test-dma-perf/meson.build
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v4] app/dma-perf: introduce dma-perf application
  2023-06-08  8:27   ` Xia, Chenbo
@ 2023-06-08  8:38     ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-08  8:38 UTC (permalink / raw)
  To: Xia, Chenbo, thomas, Richardson, Bruce, mb
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang

Hi Chenbo,

> -----Original Message-----
> From: Xia, Chenbo <chenbo.xia@intel.com>
> Sent: Thursday, June 8, 2023 4:28 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>; mb@smartsharesystems.com
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: RE: [PATCH v4] app/dma-perf: introduce dma-perf application
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Thursday, June 8, 2023 1:03 PM
> > To: thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>; Jiang,
> > Cheng1 <cheng1.jiang@intel.com>
> > Subject: [PATCH v4] app/dma-perf: introduce dma-perf application
> >
> > There are many high-performance DMA devices supported in DPDK now, and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre-set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 
> Some LONG_LINE warning could be fixed if it's friendly for searching logs.
> 
> I didn't look into all details but overall it seems good.
> 
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>

Sure, I will fix it in the next version.

Thanks a lot.
Cheng

> 
> > ---
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 471 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 567 ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1184 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> > app/test-dma-perf/config.ini  create mode 100644
> > app/test-dma-perf/main.c  create mode 100644 app/test-dma-perf/main.h
> > create mode 100644 app/test-dma-perf/meson.build
> >

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (2 preceding siblings ...)
  2023-06-08  5:03 ` [PATCH v4] " Cheng Jiang
@ 2023-06-08  8:43 ` Cheng Jiang
  2023-06-09 11:44   ` [EXT] " Anoob Joseph
  2023-06-09 14:03   ` Amit Prakash Shukla
  2023-06-13  4:31 ` [PATCH v6] " Cheng Jiang
                   ` (5 subsequent siblings)
  9 siblings, 2 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-06-08  8:43 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  59 ++++
 app/test-dma-perf/main.c      | 569 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 +++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1187 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..39d7b26955
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,472 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+/* Handshake flags and completion counters shared by the launcher and one worker. */
+struct worker_info {
+	bool ready_flag;	/* set by the worker once it has entered its copy routine */
+	bool start_flag;	/* set by the launcher; the worker spins until it is true */
+	bool stop_flag;		/* set by the launcher; checked after each pass over the buffers */
+	uint32_t total_cpl;	/* running count of completed copies, updated by the worker */
+	uint32_t test_cpl;	/* launcher-side snapshot, then delta, of total_cpl for the measured interval */
+};
+
+/* Everything one worker lcore needs to run its share of a scenario. */
+struct lcore_params {
+	uint8_t scenario_id;		/* scenario this result belongs to */
+	unsigned int lcore_id;		/* lcore the worker is launched on */
+	char *dma_name;			/* DMA device name (only set for DMA tests) */
+	uint16_t worker_id;		/* index of this entry in worker_params[] */
+	uint16_t dev_id;		/* dmadev id (only set for DMA tests) */
+	uint32_t nr_buf;		/* number of buffer pairs assigned to this worker */
+	uint16_t kick_batch;		/* enqueued copies between two DMA submits */
+	uint32_t buf_size;		/* bytes copied per operation */
+	uint16_t test_secs;		/* measurement duration, in seconds */
+	struct rte_mbuf **srcs;		/* first source mbuf of this worker's slice */
+	struct rte_mbuf **dsts;		/* first destination mbuf of this worker's slice */
+	struct worker_info worker_info;	/* handshake flags and counters */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Print "In <func>:<line> - " followed by the printf-style message to stderr.
+ * Returns the sum of the values returned by the two stdio calls.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int written = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	written += vfprintf(stderr, format, args);
+	va_end(args);
+
+	return written;
+}
+
+/*
+ * Derive the figures reported for one worker.
+ *
+ * memory:    footprint of the worker's source + destination buffers, in MB.
+ * ave_cycle: timer cycles per completed copy over the measured interval.
+ * bandwidth: Gbps, computed from buf_size and ave_cycle.
+ * mops:      millions of copies per second.
+ *
+ * When no copy completed (or the average rounds down to zero cycles) the
+ * rate outputs are reported as 0 instead of dividing by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	const uint64_t hz = rte_get_timer_hz();
+
+	/* Widen first: buf_size * nr_buf * 2 does not fit in 32 bits for large footprints. */
+	*memory = (float)((uint64_t)buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+
+	*ave_cycle = (total_cnt != 0) ? test_secs * hz / total_cnt : 0;
+	if (*ave_cycle == 0) {
+		*bandwidth = 0;
+		*mops = 0;
+		return;
+	}
+
+	*bandwidth = ((float)buf_size * 8 * (hz / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)hz / *ave_cycle / 1000000;
+}
+
+/*
+ * Print one worker's result to stdout and store the matching CSV line in
+ * output_str[], indexed by lcore id, for output_csv() to write later.
+ *
+ * dma_name is only used when is_dma is true.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, float memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %.2lfMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	/* output_str has MAX_WORKER_NB rows; a larger lcore id would write past it. */
+	if (lcore_id >= MAX_WORKER_NB) {
+		fprintf(stderr, "lcore %u is beyond the %d CSV result slots, result not recorded.\n",
+			lcore_id, MAX_WORKER_NB);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the data area of every mbuf in the array from the CPU cache, one
+ * clflush per 64 bytes. Compiles to nothing unless RTE_ARCH_X86_64 is defined.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a DMA device with a single mem-to-mem virtual channel of
+ * ring_size descriptors and start it. Any failure terminates the process
+ * through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	/* info is only meaningful when the query succeeds. */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve every DMA device name in the lcore/DMA map to a dmadev id,
+ * then configure and start each device.
+ *
+ * Returns 0 when every worker got a device, -1 otherwise. Devices set up
+ * before a failed lookup stay started.
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		/* The lookup reports failure with a negative value, not only -1. */
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+/*
+ * Worker routine for the DMA copy test.
+ *
+ * p points to a uint16_t index into worker_params[]. The worker signals
+ * ready_flag, spins until start_flag is set, then keeps enqueuing one copy
+ * per buffer pair on vchan 0 until stop_flag is seen at the end of a pass.
+ * Completions polled inside the main loop are added to total_cpl.
+ *
+ * Always returns 0; an enqueue that still fails after the ring has been
+ * drained terminates the process through rte_exit().
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	int64_t async_cnt = 0;	/* copies enqueued but not yet seen as completed */
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait for the launcher's start signal. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				/* Enqueue failed: submit what is pending and poll until there is room. */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				/* Retry once; a second failure is fatal. */
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/* Submit whenever the outstanding count is a multiple of kick_batch. */
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		/* stop_flag is only sampled between full passes over the buffers. */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/*
+	 * Drain what is still in flight, giving up after POLL_MAX polls.
+	 * These completions are not added to total_cpl.
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker routine for the CPU copy test.
+ *
+ * p points to a uint16_t index into worker_params[]. The worker signals
+ * ready_flag, spins until start_flag is set, then copies every source
+ * buffer to its destination with rte_memcpy() until stop_flag is seen at
+ * the end of a pass. total_cpl is incremented once per copy.
+ *
+ * Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait for the launcher's start signal. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * The CPU must use the virtual address of the data area.
+			 * The IOVA is a device address and is not dereferenceable
+			 * unless it happens to equal the virtual address.
+			 */
+			const void *src = rte_pktmbuf_mtod(srcs[i], void *);
+			void *dst = rte_pktmbuf_mtod(dsts[i], void *);
+
+			/* copy buffer from src to dst */
+			rte_memcpy(dst, src, (size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		/* stop_flag is only sampled between full passes over the buffers. */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA
+ * nodes and take cfg->nr_buf mbufs from each into newly allocated pointer
+ * arrays.
+ *
+ * On success returns 0 with *srcs and *dsts each holding cfg->nr_buf mbufs;
+ * the caller owns the arrays, the mbufs and both pools.
+ * On failure returns -1 after releasing everything acquired here, leaving
+ * *srcs, *dsts, src_pool and dst_pool set to NULL.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+	bool srcs_filled = false;
+
+	*srcs = NULL;
+	*dsts = NULL;
+	/* Pools of an earlier scenario have been freed by the caller; drop the stale pointers. */
+	src_pool = NULL;
+	dst_pool = NULL;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	/* The pool's data room size is a 16-bit value; larger sizes would be silently truncated. */
+	if (buf_size > UINT16_MAX - RTE_PKTMBUF_HEADROOM) {
+		PRINT_ERR("Error: buffer size %u does not fit in a single mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		goto fail;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		goto fail;
+	}
+
+	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		goto fail;
+	}
+
+	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		goto fail;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		goto fail;
+	}
+	srcs_filled = true;
+
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	/* Only an array that was filled holds valid mbuf pointers to give back. */
+	if (srcs_filled)
+		rte_pktmbuf_free_bulk(*srcs, nr_buf);
+	rte_free(*srcs);
+	rte_free(*dsts);
+	*srcs = NULL;
+	*dsts = NULL;
+
+	rte_mempool_free(src_pool);
+	rte_mempool_free(dst_pool);
+	src_pool = NULL;
+	dst_pool = NULL;
+
+	return -1;
+}
+
+/*
+ * Run one scenario of the memory copy benchmark.
+ *
+ * Allocates the buffers, optionally configures the DMA devices, launches one
+ * worker per entry of cfg->lcore_dma_map, measures completed copies over
+ * cfg->test_secs seconds and records one result per worker. All resources
+ * acquired here are released before returning.
+ *
+ * cfg:    test case description; cfg->nr_buf is updated with the buffer count.
+ * is_dma: true to copy with DMA devices, false to copy with the CPU.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint16_t nb_launched = 0;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint64_t nr_buf_wide;
+	uint32_t nr_buf = 0;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	bool dma_touched = false;
+	bool launch_failed = false;
+	float memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	if (nb_workers == 0 || nb_workers > MAX_WORKER_NB) {
+		printf("Error: invalid number of workers (%u).\n", nb_workers);
+		return;
+	}
+	if (buf_size == 0) {
+		printf("Error: buffer size must not be zero.\n");
+		return;
+	}
+	/* The DMA worker computes a remainder by kick_batch. */
+	if (is_dma && kick_batch == 0) {
+		printf("Error: kick batch must not be zero.\n");
+		return;
+	}
+
+	/* Half of the footprint is source, half destination; compute in 64 bits to avoid wrap. */
+	nr_buf_wide = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (nr_buf_wide > UINT32_MAX) {
+		printf("Error: too many buffers for the requested memory size.\n");
+		return;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)nr_buf_wide;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma) {
+		dma_touched = true;
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+	}
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/* Prepare every worker's parameters before launching any of them. */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+
+		/* Zeroed so that the handshake flags and counters start from a known state. */
+		worker_params[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			goto out;
+		}
+		if (is_dma) {
+			worker_params[i]->dma_name = ldm->dma_names[i];
+			worker_params[i]->dev_id = ldm->dma_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+	}
+
+	for (i = 0; i < nb_workers; i++) {
+		/*
+		 * Hand each worker the address of its own stored index. The
+		 * loop counter keeps changing after the launch, so its
+		 * address must not be passed.
+		 */
+		void *arg = (void *)(uintptr_t)&worker_params[i]->worker_id;
+		int rc;
+
+		lcore_id = ldm->lcores[i];
+		if (is_dma)
+			rc = rte_eal_remote_launch(do_dma_mem_copy, arg, lcore_id);
+		else
+			rc = rte_eal_remote_launch(do_cpu_mem_copy, arg, lcore_id);
+		if (rc != 0) {
+			printf("Error: failed to launch worker on lcore %u.\n", lcore_id);
+			launch_failed = true;
+			break;
+		}
+		nb_launched++;
+	}
+
+	/* Wait until every launched worker has reported ready. */
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_launched; i++) {
+			if (worker_params[i]->worker_info.ready_flag == false) {
+				ready = false;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_launched; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	if (!launch_failed) {
+		/* Let the workers warm up, then snapshot the counters. */
+		usleep(TEST_WAIT_U_SECOND);
+		for (i = 0; i < nb_workers; i++)
+			worker_params[i]->worker_info.test_cpl =
+				worker_params[i]->worker_info.total_cpl;
+
+		usleep(test_secs * 1000 * 1000);
+		for (i = 0; i < nb_workers; i++)
+			worker_params[i]->worker_info.test_cpl =
+				worker_params[i]->worker_info.total_cpl -
+				worker_params[i]->worker_info.test_cpl;
+	}
+
+	for (i = 0; i < nb_launched; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	if (launch_failed)
+		goto out;
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	for (i = 0; i < nb_workers; i++) {
+		rte_free((void *)(uintptr_t)worker_params[i]);
+		worker_params[i] = NULL;
+	}
+
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+	rte_free(srcs);
+	rte_free(dsts);
+
+	/* Clear the pool pointers so a later scenario never frees them twice. */
+	rte_mempool_free(src_pool);
+	src_pool = NULL;
+	rte_mempool_free(dst_pool);
+	dst_pool = NULL;
+
+	if (dma_touched) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..0318da305b
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,59 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It should be greater than 64 normally.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first[,last,increment[,ADD|MUL]]. ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of the whole case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..9c2b55dec4
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every pending line of output_str[] to the result file and clear
+ * it, optionally preceded by two separator rows. The file is flushed
+ * before returning.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row = 0;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	while (row < RTE_DIM(output_str)) {
+		char *line = output_str[row++];
+
+		if (line[0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", line);
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/* Write the test environment rows (timer frequency) to the result file. */
+static void
+output_env_info(void)
+{
+	const uint64_t timer_hz = rte_get_timer_hz();
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%" PRIu64 "\n", timer_hz);
+
+	output_csv(true);
+}
+
+/* Write the CSV column header row for one test case to the result file. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *header = output_str[0];
+	const char *type_name = case_cfg->test_type_str;
+
+	snprintf(header, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT, case_id, type_name);
+
+	output_csv(true);
+}
+
+/* Run the benchmark matching the case's test type; unknown types are only reported. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, true);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, false);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * At most one entry (mem_size, buf_size, ring_size or kick_batch, the last
+ * one with a non-zero increment wins) is swept from its first to its last
+ * value; each step is one scenario. Without a swept entry the case runs
+ * once. Results are appended to the CSV file after every scenario.
+ *
+ * The sweep stops when the next value would pass the last value, would not
+ * fit in 32 bits, or would not be larger than the current one. The last
+ * condition covers a multiply by 0 or 1, or a first value of 0, which
+ * would otherwise loop forever.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* One lcore stays with the launcher, so strictly more lcores than workers are needed. */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		uint64_t next;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		/* Step in 64 bits so a 32-bit wrap cannot restart the sweep. */
+		if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else
+			break;
+
+		if (next <= var_entry->cur) {
+			printf("Case %u: swept value does not increase, stopping.\n", case_id);
+			break;
+		}
+		if (next > var_entry->last)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+}
+
+/*
+ * Parse a list of lcore ids separated by commas and/or spaces (the "lcore"
+ * config key) into test_case->lcore_dma_map, replacing its previous content.
+ *
+ * Returns 0 on success, -1 on a NULL argument, allocation failure, a token
+ * that is not a valid lcore id, or more ids than the map can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *token;
+	int ret = 0;
+
+	/* Validate before touching either argument. */
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() modifies its input, so work on a private copy. */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		char *end;
+		unsigned long lcore_id;
+
+		/* Bound by the real array size, not by the larger MAX_LCORE_NB. */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcores, at most %u are supported\n",
+				(unsigned int)RTE_DIM(lcore_dma_map->lcores));
+			ret = -1;
+			break;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0' || lcore_id > UINT16_MAX) {
+			fprintf(stderr, "Illegal lcore id %s\n", token);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint16_t)lcore_id;
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse the "lcore_dma" config value, a comma separated list of
+ * "lcore<ID>@<DMA device name>" pairs, into test_case->lcore_dma_map,
+ * replacing its previous content.
+ *
+ * Returns 0 on success, -1 on a NULL argument, allocation failure, an
+ * empty or malformed list, a device name that does not fit, or more pairs
+ * than the map can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int len;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() and rte_strsplit() modify their input, so work on a private copy. */
+	input = strdup(value);
+	if (input == NULL) {
+		fprintf(stderr, "Failed to duplicate the lcore_dma string\n");
+		return -1;
+	}
+
+	if (*input == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(input, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	do {
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcore/DMA pairs, at most %u are supported\n",
+				(unsigned int)RTE_DIM(lcore_dma_map->lcores));
+			ret = -1;
+			break;
+		}
+
+		/* Both halves must be present, otherwise ptrs[1] is not set. */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal lcore/DMA pair, expected lcore<ID>@<DMA>\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		/* Bounded copy: reject names that do not fit instead of overflowing. */
+		len = snprintf(lcore_dma_map->dma_names[lcore_dma_map->cnt],
+				RTE_DEV_NAME_MAX_LEN, "%s", ptrs[1]);
+		if (len < 0 || len >= RTE_DEV_NAME_MAX_LEN) {
+			fprintf(stderr, "DMA name %s is too long\n", ptrs[1]);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse a config value of the form "first[,last[,incr[,op]]]" into entry.
+ *
+ * Only the first 254 characters of value are considered. Missing "last" and
+ * "incr" fields are stored as 0 and a missing "op" as OP_NONE; "op" must
+ * otherwise be "MUL" or "ADD". The numeric fields are converted with atoi().
+ *
+ * Returns the number of comma-separated fields found, or -1 when an argument
+ * is NULL, the value is empty, or "op" is not recognised.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char buf[255] = {0};
+	char *fields[MAX_PARAMS_PER_ENTRY];
+	int nb_fields;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	/* buf is zero-filled, so copying at most 254 bytes keeps it terminated. */
+	strncpy(buf, value, 254);
+	if (buf[0] == '\0')
+		return -1;
+
+	nb_fields = rte_strsplit(buf, strlen(buf), fields, MAX_PARAMS_PER_ENTRY, ',');
+	if (nb_fields <= 0)
+		return nb_fields;
+
+	entry->first = (uint32_t)atoi(fields[0]);
+	entry->cur = entry->first;
+
+	entry->last = 0;
+	if (nb_fields > 1)
+		entry->last = (uint32_t)atoi(fields[1]);
+
+	entry->incr = 0;
+	if (nb_fields > 2)
+		entry->incr = (uint32_t)atoi(fields[2]);
+
+	if (nb_fields <= 3) {
+		entry->op = OP_NONE;
+		return nb_fields;
+	}
+
+	if (strcmp(fields[3], "MUL") == 0) {
+		entry->op = OP_MUL;
+	} else if (strcmp(fields[3], "ADD") == 0) {
+		entry->op = OP_ADD;
+	} else {
+		printf("Invalid op %s", fields[3]);
+		nb_fields = -1;
+	}
+
+	return nb_fields;
+}
+
+/*
+ * Read the entry called name of a config file section as an integer.
+ *
+ * Returns 0 and stores the atoi() conversion of the entry in *out, or prints
+ * an error and returns -1 when the section has no such entry.
+ */
+static int
+get_int_entry(struct rte_cfgfile *cfgfile, const char *section, const char *name, int *out)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, name);
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", name, section);
+		return -1;
+	}
+
+	*out = atoi(str);
+	return 0;
+}
+
+/*
+ * Parse the entry called name of a config file section with parse_entry().
+ *
+ * *nb_vp is incremented when the entry has more than one field, i.e. when it
+ * describes a range rather than a single value.
+ *
+ * Returns 0 on success, -1 when parse_entry() reports an error (which
+ * includes the entry being absent).
+ */
+static int
+load_var_entry(struct rte_cfgfile *cfgfile, const char *section, const char *name,
+		struct test_configure_entry *entry, int *nb_vp)
+{
+	int args_nr = parse_entry(rte_cfgfile_get_entry(cfgfile, section, name), entry);
+
+	if (args_nr < 0)
+		return -1;
+	if (args_nr > 1)
+		(*nb_vp)++;
+
+	return 0;
+}
+
+/*
+ * Fill test_case from section "case<case_id>" of the config file.
+ *
+ * Returns 0 when the whole section was parsed, or -1, after printing the
+ * reason, when it cannot be used. test_case may be partially written in the
+ * failure case.
+ */
+static int
+load_case(struct rte_cfgfile *cfgfile, int case_id, struct test_configure *test_case)
+{
+	char section_name[CFG_NAME_LEN];
+	const char *case_type, *lcore_str, *eal_args;
+	int nb_vp = 0;
+	int value;
+	bool is_dma;
+
+	snprintf(section_name, CFG_NAME_LEN, "case%d", case_id);
+
+	case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+	if (!case_type) {
+		printf("Error: No case type in case %d, the test will be finished here.\n",
+			case_id);
+		return -1;
+	}
+
+	if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+		test_case->test_type_str = DMA_MEM_COPY;
+		is_dma = true;
+	} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+		test_case->test_type_str = CPU_MEM_COPY;
+		is_dma = false;
+	} else {
+		printf("Error: Cannot find case type %s in case%d.\n", case_type, case_id);
+		return -1;
+	}
+
+	if (get_int_entry(cfgfile, section_name, "src_numa_node", &value) < 0)
+		return -1;
+	test_case->src_numa_node = value;
+
+	if (get_int_entry(cfgfile, section_name, "dst_numa_node", &value) < 0)
+		return -1;
+	test_case->dst_numa_node = value;
+
+	if (load_var_entry(cfgfile, section_name, "mem_size",
+				&test_case->mem_size, &nb_vp) < 0 ||
+			load_var_entry(cfgfile, section_name, "buf_size",
+				&test_case->buf_size, &nb_vp) < 0) {
+		printf("parse error in case %d.\n", case_id);
+		return -1;
+	}
+
+	if (is_dma) {
+		if (load_var_entry(cfgfile, section_name, "dma_ring_size",
+					&test_case->ring_size, &nb_vp) < 0 ||
+				load_var_entry(cfgfile, section_name, "kick_batch",
+					&test_case->kick_batch, &nb_vp) < 0) {
+			printf("parse error in case %d.\n", case_id);
+			return -1;
+		}
+
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+		if (parse_lcore_dma(test_case, lcore_str) < 0) {
+			printf("parse lcore dma error in case %d.\n", case_id);
+			return -1;
+		}
+	} else {
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+		if (parse_lcore(test_case, lcore_str) < 0) {
+			printf("parse lcore error in case %d.\n", case_id);
+			return -1;
+		}
+	}
+
+	if (nb_vp > 1) {
+		printf("Error, each section can only have a single variable parameter.\n");
+		return -1;
+	}
+
+	if (get_int_entry(cfgfile, section_name, "cache_flush", &value) < 0)
+		return -1;
+	test_case->cache_flush = value;
+
+	if (get_int_entry(cfgfile, section_name, "test_seconds", &value) < 0)
+		return -1;
+	test_case->test_secs = (uint16_t)value;
+
+	/*
+	 * The strings returned by rte_cfgfile_get_entry() belong to cfgfile and
+	 * are released by rte_cfgfile_close(), so keep a private copy. The copy
+	 * is intentionally never freed: it is needed until the process exits.
+	 */
+	eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	test_case->eal_args = NULL;
+	if (eal_args != NULL) {
+		test_case->eal_args = strdup(eal_args);
+		if (test_case->eal_args == NULL) {
+			printf("Error: Cannot allocate eal_args in case %d.\n", case_id);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Load the test cases described by the config file at path into test_cases[].
+ *
+ * The i-th entry of test_cases[] is filled from the section named
+ * "case<i + 1>"; its is_valid flag tells whether the section was parsed
+ * successfully. The process exits if the file cannot be loaded or holds more
+ * than MAX_TEST_CASES sections.
+ *
+ * Returns the number of sections processed, valid or not.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++)
+		test_cases[i].is_valid = load_case(cfgfile, i + 1, &test_cases[i]) == 0;
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector handed to rte_eal_init() for one test case.
+ *
+ * Copies argv into new_argv, leaving out the CMDLINE_CONFIG_ARG and
+ * CMDLINE_RESULT_ARG options together with the value following each of them,
+ * then appends the space-separated tokens of eal_args (which may be NULL).
+ *
+ * new_argv must provide MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes
+ * each, as main() sets up. Arguments longer than a buffer are truncated, and
+ * arguments beyond MAX_EAL_PARAM_NB are dropped with a message on stderr.
+ *
+ * Returns the number of entries written to new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* Skip the value of the option as well. */
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB)
+			goto overflow;
+		/* new_argv[new_argc] is a pointer, so sizeof() cannot give the buffer size. */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		/* rte_strsplit() writes into its input, so split a local copy. */
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			if (new_argc >= MAX_EAL_PARAM_NB)
+				goto overflow;
+			strlcpy(new_argv[new_argc], tokens[i], MAX_EAL_PARAM_LEN);
+			new_argc++;
+		}
+	}
+
+	return new_argc;
+
+overflow:
+	fprintf(stderr, "Too many EAL arguments, only the first %d are used.\n", new_argc);
+	return new_argc;
+}
+
+/*
+ * Entry point of the application.
+ *
+ * Locates the CMDLINE_CONFIG_ARG (required) and CMDLINE_RESULT_ARG (optional)
+ * options, loads the test cases from the config file and truncates the result
+ * CSV file. Each valid case is then run in a forked child process, which
+ * initializes the EAL with the remaining command-line arguments plus the
+ * eal_args of the case, appends its results to the CSV file and exits. The
+ * parent waits for every child and reports how it terminated.
+ *
+ * When no result file is given, its name is derived from the config file
+ * name: the part before the first '.' followed by "_result.csv".
+ *
+ * Returns 0 when all cases were processed, -1 on invalid arguments or when the
+ * result file cannot be created.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char *rst_name;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last simply yields a NULL path. */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		/*
+		 * The name is edited in place: this relies on basename() returning
+		 * a pointer into rst_path. strtok() returns NULL when the file name
+		 * consists of '.' characters only.
+		 */
+		rst_name = strtok(basename(rst_path), ".");
+		if (rst_name == NULL ||
+				strlen(rst_path) + sizeof("_result.csv") > sizeof(rst_path)) {
+			printf("Cannot derive the result file name from %s.\n", cfg_path_ptr);
+			return -1;
+		}
+		strcat(rst_name, "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* Create or truncate the result file; the children reopen it in append mode. */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %u.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %u.\n\n", i + 1);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %u failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd)
+				rte_exit(EXIT_FAILURE, "Open output CSV file error.\n");
+
+			if (is_first_case)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+
+			/*
+			 * A child only changes its own copy of is_first_case, so the
+			 * flag has to be cleared here. Clear it only once a case has
+			 * completed, so that the environment info is never lost.
+			 */
+			if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == EXIT_SUCCESS)
+				is_first_case = false;
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128	/* rows of output_str; capacity of the lcore_dma_map_t arrays */
+#define MAX_OUTPUT_STR_LEN 512	/* size in bytes of one output_str row */
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256	/* NOTE(review): larger than MAX_WORKER_NB, which sizes lcores[] */
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+typedef enum {	/* how a variable test parameter is stepped from one scenario to the next */
+	OP_NONE = 0,	/* no stepping: the scenario loop stops after the first scenario */
+	OP_ADD,	/* cur += incr */
+	OP_MUL	/* cur *= incr */
+} alg_op_type;
+
+struct test_configure_entry {	/* one config parameter, optionally swept over a range */
+	uint32_t first;	/* first field of the config value; initial value of cur */
+	uint32_t last;	/* second field, 0 if absent; scenarios run while cur <= last */
+	uint32_t incr;	/* third field, 0 if absent; step applied to cur according to op */
+	alg_op_type op;	/* fourth field ("ADD" or "MUL"); OP_NONE if absent */
+	uint32_t cur;	/* value used by the scenario being run */
+};
+
+struct lcore_dma_map_t {	/* worker lcores and, for DMA cases, the device paired with each */
+	uint32_t lcores[MAX_WORKER_NB];	/* lcore ID of each worker */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];	/* DMA device name per worker */
+	int16_t dma_ids[MAX_WORKER_NB];	/* dmadev ID per worker; not set by the config parsers */
+	uint16_t cnt;	/* number of entries in use in the arrays above */
+};
+
+struct test_configure {	/* settings of one "caseN" section of the config file */
+	bool is_valid;	/* false if the section failed to parse; main() skips such cases */
+	uint8_t test_type;	/* TEST_TYPE_* value chosen by the "type" entry */
+	const char *test_type_str;	/* DMA_MEM_COPY or CPU_MEM_COPY, matching test_type */
+	uint16_t src_numa_node;	/* "src_numa_node" entry */
+	uint16_t dst_numa_node;	/* "dst_numa_node" entry */
+	uint16_t opcode;
+	bool is_dma;
+	struct lcore_dma_map_t lcore_dma_map;	/* "lcore_dma" (DMA) or "lcore" (CPU) entry */
+	struct test_configure_entry mem_size;	/* "mem_size" entry */
+	struct test_configure_entry buf_size;	/* "buf_size" entry */
+	struct test_configure_entry ring_size;	/* "dma_ring_size" entry, DMA cases only */
+	struct test_configure_entry kick_batch;	/* "kick_batch" entry, DMA cases only */
+	uint32_t cache_flush;	/* "cache_flush" entry */
+	uint32_t nr_buf;
+	uint16_t test_secs;	/* "test_seconds" entry */
+	const char *eal_args;	/* "eal_args" entry, or NULL; extra arguments for rte_eal_init() */
+	uint8_t scenario_id;	/* number of the scenario being run, counted from 1 within a case */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']  # DPDK libraries whose APIs the sources call
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-06-08  8:43 ` [PATCH v5] " Cheng Jiang
@ 2023-06-09 11:44   ` Anoob Joseph
  2023-06-12  7:40     ` Jiang, Cheng1
  2023-06-09 14:03   ` Amit Prakash Shukla
  1 sibling, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-09 11:44 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Jerin Jacob Kollanukkaran, Vamsi Krishna Attunuru,
	Amit Prakash Shukla, Satha Koteswara Rao Kottidi,
	Gowrishankar Muthukrishnan, Vidya Sagar Velumuri

Hi,

Thanks for adding the app. Few comments inline. Please check.

Thanks,
Anoob

> -----Original Message-----
> From: Cheng Jiang <cheng1.jiang@intel.com>
> Sent: Thursday, June 8, 2023 2:14 PM
> To: thomas@monjalon.net; bruce.richardson@intel.com;
> mb@smartsharesystems.com; chenbo.xia@intel.com
> Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> Cheng Jiang <cheng1.jiang@intel.com>
> Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> External Email
> 
> ----------------------------------------------------------------------
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 569
> ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1187 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build
> 

<snip>

> +
> +/* Configuration of device. */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> +	uint16_t vchan = 0;
> +	struct rte_dma_info info;
> +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };

[Anoob] Is it possible to use more vchans? The code launches as many threads as the number of dma devices. Instead it should be total number of vchans.

> +	struct rte_dma_vchan_conf qconf = {
> +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> +		.nb_desc = ring_size
> +	};
> +
> +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> +
> +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> +
> +	rte_dma_info_get(dev_id, &info);
> +	if (info.nb_vchans != 1)
> +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> reported on device id. %u\n",
> +				dev_id);
> +
> +	if (rte_dma_start(dev_id) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> +
> +

<snip>

> +static inline int
> +do_dma_mem_copy(void *p)
> +{
> +	uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = worker_params[*para_idx];
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint16_t kick_batch = para->kick_batch;

[Anoob] Some of these variables can be made const. Since this is fast path, might be beneficial doing that way.

> +	uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	int64_t async_cnt = 0;
> +	int nr_cpl = 0;
> +	uint32_t i;
> +	uint32_t poll_cnt = 0;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +			if (unlikely(rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0) {
> +					nr_cpl = rte_dma_completed(dev_id,
> 0, MAX_DMA_CPL_NB,
> +								NULL, NULL);
> +					async_cnt -= nr_cpl;
> +					worker_info->total_cpl += nr_cpl;
> +				}
> +				if (rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0) {
> +					printf("enqueue fail again at %u\n",
> i);
> +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");
> +				}

[Anoob] Only if the API returns -ENOSPC we should retry submitting, right? Other errors should be treated as fatal errors.

Do we need to use rte_dma_burst_capacity() API?

Can't we try something like,

dma_copy:
		ret =  rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]), rte_pktmbuf_iova(dsts[i]), buf_size, 0);
		if (unlikely(ret < 0)) {
			if (ret == -ENOSPC) {
				rte_dma_submit(dev_id, 0);
				/* DMA completed & other handling */
				goto dma_copy;
			} else {
				/* Error exit */
			}
		}

			
> +			}
> +			async_cnt++;
> +
> +			if ((async_cnt % kick_batch) == 0) {
> +				rte_dma_submit(dev_id, 0);
> +				/* add a poll to avoid ring full */
> +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +				async_cnt -= nr_cpl;
> +				worker_info->total_cpl += nr_cpl;

[Anoob] Above code can be made as a static inline function so that in cases rte_dma_copy returns -ENOSPC, same static inline can be called.

<snip>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-06-08  8:43 ` [PATCH v5] " Cheng Jiang
  2023-06-09 11:44   ` [EXT] " Anoob Joseph
@ 2023-06-09 14:03   ` Amit Prakash Shukla
  2023-06-12  8:26     ` Jiang, Cheng1
  1 sibling, 1 reply; 53+ messages in thread
From: Amit Prakash Shukla @ 2023-06-09 14:03 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Jerin Jacob Kollanukkaran, Anoob Joseph



> -----Original Message-----
> From: Cheng Jiang <cheng1.jiang@intel.com>
> Sent: Thursday, June 8, 2023 2:14 PM
> To: thomas@monjalon.net; bruce.richardson@intel.com;
> mb@smartsharesystems.com; chenbo.xia@intel.com
> Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> Cheng Jiang <cheng1.jiang@intel.com>
> Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> External Email
> 
> ----------------------------------------------------------------------
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 569
> ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1187 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build
> 

<snip>

> +
> +static inline int
> +do_dma_mem_copy(void *p)
> +{
> +	uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = worker_params[*para_idx];
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint16_t kick_batch = para->kick_batch;
> +	uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	int64_t async_cnt = 0;
> +	int nr_cpl = 0;
> +	uint32_t i;
> +	uint32_t poll_cnt = 0;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +			if (unlikely(rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0) {
> +					nr_cpl = rte_dma_completed(dev_id,
> 0, MAX_DMA_CPL_NB,
> +								NULL, NULL);
> +					async_cnt -= nr_cpl;
> +					worker_info->total_cpl += nr_cpl;
> +				}
> +				if (rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0) {
> +					printf("enqueue fail again at %u\n",
> i);
> +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");

[Amit]: On all success or failure exits, please call rte_dma_stop and rte_dma_close to exit cleanly.

> +				}
> +			}
> +			async_cnt++;
> +
> +			if ((async_cnt % kick_batch) == 0) {
> +				rte_dma_submit(dev_id, 0);
> +				/* add a poll to avoid ring full */
> +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +				async_cnt -= nr_cpl;
> +				worker_info->total_cpl += nr_cpl;
> +			}
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +

<snip>

> +
> +void
> +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> +	uint16_t i;
> +	uint32_t offset;
> +	unsigned int lcore_id = 0;
> +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	uint16_t kick_batch = cfg->kick_batch.cur;
> +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> +	uint16_t nb_workers = ldm->cnt;
> +	uint16_t test_secs = cfg->test_secs;
> +	float memory;
> +	uint32_t avg_cycles = 0;
> +	float mops;
> +	float bandwidth;
> +
> +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +		goto out;
> +
> +	if (is_dma)
> +		if (config_dmadevs(cfg) < 0)
> +			goto out;
> +
> +	if (cfg->cache_flush) {
> +		cache_flush_buf(srcs, buf_size, nr_buf);
> +		cache_flush_buf(dsts, buf_size, nr_buf);
> +		rte_mb();
> +	}
> +
> +	printf("Start testing....\n");
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		lcore_id = ldm->lcores[i];
> +		offset = nr_buf / nb_workers * i;
> +
> +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> lcore_params), 0);
> +		if (!worker_params[i]) {
> +			printf("lcore parameters malloc failure for lcore
> %d\n", lcore_id);
> +			break;
> +		}
> +		if (is_dma) {
> +			worker_params[i]->dma_name = ldm-
> >dma_names[i];
> +			worker_params[i]->dev_id = ldm->dma_ids[i];
> +			worker_params[i]->kick_batch = kick_batch;
> +		}
> +		worker_params[i]->worker_id = i;
> +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> nb_workers);
> +		worker_params[i]->buf_size = buf_size;
> +		worker_params[i]->test_secs = test_secs;
> +		worker_params[i]->srcs = srcs + offset;
> +		worker_params[i]->dsts = dsts + offset;
> +		worker_params[i]->scenario_id = cfg->scenario_id;
> +		worker_params[i]->lcore_id = lcore_id;
> +
> +		if (is_dma)
> +			rte_eal_remote_launch(do_dma_mem_copy, (void
> *)(&i), lcore_id);
> +		else
> +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> *)(&i), lcore_id);
> +	}
> +
> +	while (1) {
> +		bool ready = true;
> +		for (i = 0; i < nb_workers; i++) {
> +			if (worker_params[i]->worker_info.ready_flag ==
> false) {
> +				ready = 0;
> +				break;
> +			}
> +		}
> +		if (ready)
> +			break;
> +	}
> +
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.start_flag = true;
> +
> +	usleep(TEST_WAIT_U_SECOND);
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.test_cpl =
> +worker_params[i]->worker_info.total_cpl;
> +
> +	usleep(test_secs * 1000 * 1000);
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.test_cpl =
> worker_params[i]->worker_info.total_cpl -
> +						worker_params[i]-
> >worker_info.test_cpl;
> +
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.stop_flag = true;
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> +			worker_params[i]->worker_info.test_cpl,
> +			&memory, &avg_cycles, &bandwidth, &mops);
> +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> +					worker_params[i]->dma_name,
> avg_cycles, buf_size,
> +					nr_buf / nb_workers, memory,
> bandwidth, mops, is_dma);
> +	}
> +
> +out:
> +	/* free env */
> +	if (srcs)
> +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> +	if (dsts)
> +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> +
> +	if (src_pool)
> +		rte_mempool_free(src_pool);
> +	if (dst_pool)
> +		rte_mempool_free(dst_pool);
> +
> +	if (is_dma) {
> +		for (i = 0; i < nb_workers; i++) {
> +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> +			rte_dma_stop(ldm->dma_ids[i]);

[Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.

<snip>

> +#endif /* _MAIN_H_ */
> diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> --- /dev/null
> +++ b/app/test-dma-perf/meson.build
> @@ -0,0 +1,17 @@
> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023 Intel
> +Corporation
> +
> +# meson file, for building this app as part of a main DPDK build.
> +
> +if is_windows
> +    build = false
> +    reason = 'not supported on Windows'
> +    subdir_done()
> +endif
> +
> +deps += ['dmadev', 'mbuf', 'cfgfile']
> +
> +sources = files(
> +        'main.c',
> +        'benchmark.c',
> +)
> --
> 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-06-09 11:44   ` [EXT] " Anoob Joseph
@ 2023-06-12  7:40     ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-12  7:40 UTC (permalink / raw)
  To: Anoob Joseph, thomas, Richardson, Bruce, mb, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Jerin Jacob Kollanukkaran, Vamsi Krishna Attunuru,
	Amit Prakash Shukla, Satha Koteswara Rao Kottidi,
	Gowrishankar Muthukrishnan, Vidya Sagar Velumuri

Hi,

Thanks for your comments, the replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Friday, June 9, 2023 7:44 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Satha Koteswara Rao Kottidi
> <skoteshwar@marvell.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>; Vidya Sagar Velumuri
> <vvelumuri@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Thanks for adding the app. Few comments inline. Please check.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Cheng Jiang <cheng1.jiang@intel.com>
> > Sent: Thursday, June 8, 2023 2:14 PM
> > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > mb@smartsharesystems.com; chenbo.xia@intel.com
> > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> > Cheng Jiang <cheng1.jiang@intel.com>
> > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre- set config file. Memory Copy performance test
> are supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 569
> > ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1187 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode
> 100644
> > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> > 100644 app/test-dma-perf/meson.build
> >
> 
> <snip>
> 
> > +
> > +/* Configuration of device. */
> > +static void
> > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > +	uint16_t vchan = 0;
> > +	struct rte_dma_info info;
> > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> 
> [Anoob] Is it possible to use more vchans? The code launches as many
> threads as the number of dma devices. Instead it should be total number of
> vchans.

[Cheng] Really good suggestion. This is feasible, but in the initial stage, we want to keep things simple. Perhaps in the future, we can add a parameter to configure the number of vchans for each device and then launch the corresponding number of threads for each vchan.

> 
> > +	struct rte_dma_vchan_conf qconf = {
> > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > +		.nb_desc = ring_size
> > +	};
> > +
> > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> > +
> > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> > +
> > +	rte_dma_info_get(dev_id, &info);
> > +	if (info.nb_vchans != 1)
> > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > reported on device id. %u\n",
> > +				dev_id);
> > +
> > +	if (rte_dma_start(dev_id) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > +
> > +
> 
> <snip>
> 
> > +static inline int
> > +do_dma_mem_copy(void *p)
> > +{
> > +	uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	uint16_t dev_id = para->dev_id;
> > +	uint32_t nr_buf = para->nr_buf;
> > +	uint16_t kick_batch = para->kick_batch;
> 
> [Anoob] Some of these variables can be made const. Since this is fast path,
> might be beneficial doing that way.

[Cheng] Good idea, I'll improve it in the next version.

> 
> > +	uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t i;
> > +	uint32_t poll_cnt = 0;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > 0) {
> > +					nr_cpl = rte_dma_completed(dev_id,
> > 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +					worker_info->total_cpl += nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> > i);
> > +					printf("space:%d\n",
> > rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA
> > enqueue failed\n");
> > +				}
> 
> [Anoob] Only if the API returns -ENOSPC we should retry submitting, right?
> Other errors should be treated as fatal errors.
> 
> Do we need to use rte_dma_burst_capacity() API?
> 
> Can't we try something like,
> 
> dma_copy:
> 		ret =  rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]),
> rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> 		if (unlikely (ret < 0) {
> 			if (ret == -ENOSPC) {
> 				rte_dma_submit(dev_id, 0);
> 				/* DMA completed & other handling */
> 				goto dma_copy;
> 			} else {
> 				/* Error exit */
> 			}
> 		}
> 
> 

[Cheng] Good idea, we don't have to check the capacity explicitly. I think your implementation is more clear, thanks. I will fix it in the next version.

> > +			}
> > +			async_cnt++;
> > +
> > +			if ((async_cnt % kick_batch) == 0) {
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +				worker_info->total_cpl += nr_cpl;
> 
> [Anoob] Above code can be made as a static inline function so that in cases
> rte_dma_copy returns -ENOSPC, same static inline can be called.
> 

[Cheng] sure, got it. Thanks!

> <snip>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-06-09 14:03   ` Amit Prakash Shukla
@ 2023-06-12  8:26     ` Jiang, Cheng1
  2023-06-13  4:51       ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-12  8:26 UTC (permalink / raw)
  To: Amit Prakash Shukla, thomas, Richardson, Bruce, mb, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Jerin Jacob Kollanukkaran, Anoob Joseph

Hi,

Thanks for your comments, replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> Sent: Friday, June 9, 2023 10:03 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> 
> 
> > -----Original Message-----
> > From: Cheng Jiang <cheng1.jiang@intel.com>
> > Sent: Thursday, June 8, 2023 2:14 PM
> > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > mb@smartsharesystems.com; chenbo.xia@intel.com
> > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> > Cheng Jiang <cheng1.jiang@intel.com>
> > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre- set config file. Memory Copy performance test
> are supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 569
> > ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1187 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode
> 100644
> > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> > 100644 app/test-dma-perf/meson.build
> >
> 
> <snip>
> 
> > +
> > +static inline int
> > +do_dma_mem_copy(void *p)
> > +{
> > +	uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	uint16_t dev_id = para->dev_id;
> > +	uint32_t nr_buf = para->nr_buf;
> > +	uint16_t kick_batch = para->kick_batch;
> > +	uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t i;
> > +	uint32_t poll_cnt = 0;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > 0) {
> > +					nr_cpl = rte_dma_completed(dev_id,
> > 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +					worker_info->total_cpl += nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> > i);
> > +					printf("space:%d\n",
> > rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA
> > enqueue failed\n");
> 
> [Amit]: On all success or failure exits, please call rte_dma_stop and
> rte_dma_close to exit cleanly.

[Cheng] Got it. Thanks, I'll fix it in the next version.

> 
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			if ((async_cnt % kick_batch) == 0) {
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +				worker_info->total_cpl += nr_cpl;
> > +			}
> > +		}
> > +
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	rte_dma_submit(dev_id, 0);
> > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +		async_cnt -= nr_cpl;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> 
> <snip>
> 
> > +
> > +void
> > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > +	uint16_t i;
> > +	uint32_t offset;
> > +	unsigned int lcore_id = 0;
> > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> > (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = ldm->cnt;
> > +	uint16_t test_secs = cfg->test_secs;
> > +	float memory;
> > +	uint32_t avg_cycles = 0;
> > +	float mops;
> > +	float bandwidth;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	if (is_dma)
> > +		if (config_dmadevs(cfg) < 0)
> > +			goto out;
> > +
> > +	if (cfg->cache_flush) {
> > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > +		rte_mb();
> > +	}
> > +
> > +	printf("Start testing....\n");
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = ldm->lcores[i];
> > +		offset = nr_buf / nb_workers * i;
> > +
> > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > lcore_params), 0);
> > +		if (!worker_params[i]) {
> > +			printf("lcore parameters malloc failure for lcore
> > %d\n", lcore_id);
> > +			break;
> > +		}
> > +		if (is_dma) {
> > +			worker_params[i]->dma_name = ldm-
> > >dma_names[i];
> > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > +			worker_params[i]->kick_batch = kick_batch;
> > +		}
> > +		worker_params[i]->worker_id = i;
> > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > nb_workers);
> > +		worker_params[i]->buf_size = buf_size;
> > +		worker_params[i]->test_secs = test_secs;
> > +		worker_params[i]->srcs = srcs + offset;
> > +		worker_params[i]->dsts = dsts + offset;
> > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > +		worker_params[i]->lcore_id = lcore_id;
> > +
> > +		if (is_dma)
> > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > *)(&i), lcore_id);
> > +		else
> > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > *)(&i), lcore_id);
> > +	}
> > +
> > +	while (1) {
> > +		bool ready = true;
> > +		for (i = 0; i < nb_workers; i++) {
> > +			if (worker_params[i]->worker_info.ready_flag ==
> > false) {
> > +				ready = 0;
> > +				break;
> > +			}
> > +		}
> > +		if (ready)
> > +			break;
> > +	}
> > +
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.start_flag = true;
> > +
> > +	usleep(TEST_WAIT_U_SECOND);
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.test_cpl =
> > +worker_params[i]->worker_info.total_cpl;
> > +
> > +	usleep(test_secs * 1000 * 1000);
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.test_cpl =
> > worker_params[i]->worker_info.total_cpl -
> > +						worker_params[i]-
> > >worker_info.test_cpl;
> > +
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.stop_flag = true;
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > +			worker_params[i]->worker_info.test_cpl,
> > +			&memory, &avg_cycles, &bandwidth, &mops);
> > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > +					worker_params[i]->dma_name,
> > avg_cycles, buf_size,
> > +					nr_buf / nb_workers, memory,
> > bandwidth, mops, is_dma);
> > +	}
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs)
> > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > +	if (dsts)
> > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +
> > +	if (is_dma) {
> > +		for (i = 0; i < nb_workers; i++) {
> > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > +			rte_dma_stop(ldm->dma_ids[i]);
> 
> [Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.
> 

[Cheng] Sure, I'll fix it in the next version.

> <snip>
> 
> > +#endif /* _MAIN_H_ */
> > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > --- /dev/null
> > +++ b/app/test-dma-perf/meson.build
> > @@ -0,0 +1,17 @@
> > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > +Intel Corporation
> > +
> > +# meson file, for building this app as part of a main DPDK build.
> > +
> > +if is_windows
> > +    build = false
> > +    reason = 'not supported on Windows'
> > +    subdir_done()
> > +endif
> > +
> > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > +
> > +sources = files(
> > +        'main.c',
> > +        'benchmark.c',
> > +)
> > --
> > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (3 preceding siblings ...)
  2023-06-08  8:43 ` [PATCH v5] " Cheng Jiang
@ 2023-06-13  4:31 ` Cheng Jiang
  2023-06-13 12:55   ` huangdengdui
  2023-06-15  5:21   ` [EXT] " Anoob Joseph
  2023-06-18 12:26 ` [PATCH v7] " Cheng Jiang
                   ` (4 subsequent siblings)
  9 siblings, 2 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-06-13  4:31 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v6:
  improved code based on Anoob's comments;
  fixed some code structure issues;
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 477 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  59 ++++
 app/test-dma-perf/main.c      | 569 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 +++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1192 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..bc1ca82297
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+struct worker_info {
+	bool ready_flag;
+	bool start_flag;
+	bool stop_flag;
+	uint32_t total_cpl;
+	uint32_t test_cpl;
+};
+
+struct lcore_params {
+	uint8_t scenario_id;
+	unsigned int lcore_id;
+	char *dma_name;
+	uint16_t worker_id;
+	uint16_t dev_id;
+	uint32_t nr_buf;
+	uint16_t kick_batch;
+	uint32_t buf_size;
+	uint16_t test_secs;
+	struct rte_mbuf **srcs;
+	struct rte_mbuf **dsts;
+	struct worker_info worker_info;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
+	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000;
+}
+
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, float memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %.2lfMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	char *data;
+	struct rte_mbuf **srcs = array;
+	uint32_t i, offset;
+
+	for (i = 0; i < nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (offset = 0; offset < buf_size; offset += 64)
+			__builtin_ia32_clflush(data + offset);
+	}
+#endif
+}
+
+/* Configuration of device. */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	rte_dma_info_get(dev_id, &info);
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		if (dev_id == -1) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+
+static inline void
+do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
+			volatile struct worker_info *worker_info)
+{
+	int ret;
+	uint16_t nr_cpl;
+
+	ret = rte_dma_submit(dev_id, 0);
+	if (ret < 0) {
+		rte_dma_stop(dev_id);
+		rte_dma_close(dev_id);
+		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
+	}
+
+	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+	*async_cnt -= nr_cpl;
+	worker_info->total_cpl += nr_cpl;
+}
+
+static inline int
+do_dma_mem_copy(void *p)
+{
+	const uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]),
+				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else {
+					/* Error exit */
+					rte_dma_stop(dev_id);
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	const uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint32_t nr_buf = para->nr_buf;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/* copy buffer from src to dst */
+			rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
+				(void *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+
+		worker_params[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		if (is_dma) {
+			worker_params[i]->dma_name = ldm->dma_names[i];
+			worker_params[i]->dev_id = ldm->dma_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy, (void *)(&i), lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(&i), lcore_id);
+	}
+
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (worker_params[i]->worker_info.ready_flag == false) {
+				ready = 0;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl;
+
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl -
+						worker_params[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..2fd9c3c387
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,59 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It should be greater than 64 normally.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first,last,increment,ADD|MUL.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the measurement time, in seconds, of each scenario of the case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..d65655b87b
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+/* Values of test_configure.test_type, selected by the "type" config entry. */
+enum {
+	TEST_TYPE_NONE = 0,	/* zero value of a test_cases[] slot whose type was never set */
+	TEST_TYPE_DMA_MEM_COPY,	/* "DMA_MEM_COPY" */
+	TEST_TYPE_CPU_MEM_COPY	/* "CPU_MEM_COPY" */
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Write every non-empty row buffered in output_str[] to the result file and
+ * clear each row once written. When need_blankline is set, two empty CSV
+ * rows are emitted first as a separator.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+	int pad;
+
+	for (pad = 0; need_blankline && pad < 2; pad++)
+		fprintf(fd, ",,,,,,,,\n");
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		char *line = output_str[row];
+
+		if (line[0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", line);
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/*
+ * Emit the "test environment" rows: a title row followed by the value of
+ * rte_get_timer_hz(), labelled "CPU frequency".
+ */
+static void
+output_env_info(void)
+{
+	uint64_t timer_hz = rte_get_timer_hz();
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN,
+			"CPU frequency,%" PRIu64 "\n", timer_hz);
+
+	output_csv(true);
+}
+
+/* Emit the CSV column-header row that introduces the results of one case. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *row = output_str[0];
+
+	snprintf(row, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT,
+			case_id, case_cfg->test_type_str);
+	output_csv(true);
+}
+
+/*
+ * Run one scenario of a case: dispatch to the memory-copy benchmark in DMA
+ * or CPU mode according to the case's test type.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, true);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, false);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * Among mem_size, buf_size, dma_ring_size and kick_batch, the last one that
+ * has a non-zero increment is the swept entry: it is stepped from "first" to
+ * "last" with its ADD/MUL operation, and one benchmark scenario is run per
+ * step. When no entry has an increment, a single scenario is run.
+ *
+ * The case is skipped when the number of EAL lcores does not exceed the
+ * number of configured workers.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		uint64_t next;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		/* Compute the next value in 64 bits so that wrap-around is visible. */
+		if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else
+			break;
+
+		/*
+		 * Stop when the step makes no progress (e.g. "MUL" by 1, or a
+		 * current value of 0) or no longer fits in 32 bits; either case
+		 * would otherwise keep the loop condition true forever.
+		 */
+		if (next <= var_entry->cur || next > UINT32_MAX)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case: a list of lcore IDs separated
+ * by commas and/or spaces, e.g. "3, 4".
+ *
+ * On success the IDs are stored in test_case->lcore_dma_map.lcores[] and
+ * lcore_dma_map.cnt holds their number. IDs are converted with atoi() and
+ * are not validated.
+ *
+ * Returns 0 on success, -1 on a NULL argument, on allocation failure, or
+ * when more IDs are listed than lcores[] can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *token;
+	int ret = 0;
+
+	/* Validate before either pointer is used. */
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() modifies its argument, so work on a private copy. */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		/* Bound by the real array size, which is smaller than MAX_LCORE_NB. */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint16_t)atoi(token);
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case: a comma-separated list of
+ * "lcore<ID>@<DMA device name>" pairs, e.g.
+ * "lcore10@0000:00:04.2, lcore11@0000:00:04.3".
+ *
+ * Each pair fills one slot of test_case->lcore_dma_map (lcores[] and
+ * dma_names[]); cnt is the number of pairs stored. When an error is found
+ * part-way through the list, the pairs parsed so far remain in the map.
+ *
+ * Returns 0 on success, -1 on NULL, empty or malformed input, on allocation
+ * failure, or when more pairs are listed than the map can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL) {
+		fprintf(stderr, "No input DMA addresses\n");
+		return -1;
+	}
+
+	/* strtok() and rte_strsplit() modify the string, so work on a copy. */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	/* Skip leading blanks, stopping at the terminator rather than past it. */
+	addrs = input;
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	do {
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcore-DMA pairs\n");
+			ret = -1;
+			break;
+		}
+
+		/* Both halves of "lcore<ID>@<device>" must be present. */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal lcore-DMA pair\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;	/* strlen("lcore") */
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		/* Bounded copy: the device name comes straight from the config file. */
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+			sizeof(lcore_dma_map->dma_names[0]));
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse one configuration value of the form "N" or
+ * "first,last,increment,op" (op being "ADD" or "MUL") into *entry.
+ *
+ * Numbers are converted with atoi() and are not validated; values longer
+ * than 254 characters are truncated before parsing.
+ *
+ * Returns the number of comma-separated fields parsed (1 or 4), or -1 when
+ * value/entry is NULL, the value is empty, the field count is neither 1 nor
+ * 4, or op is not recognised.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	/* input[] is zero-filled, so the copy is always terminated. */
+	strncpy(input, value, sizeof(input) - 1);
+	if (*input == '\0')
+		return -1;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	/*
+	 * Report a wrong field count as an error. Returning the raw count (2
+	 * or 3) would look like a valid swept entry to callers, although
+	 * *entry was never filled in.
+	 */
+	if (args_nr != 1 && args_nr != 4)
+		return -1;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+
+	if (args_nr == 1) {
+		entry->op = OP_NONE;
+		entry->last = 0;
+		entry->incr = 0;
+		return args_nr;
+	}
+
+	entry->last = (uint32_t)atoi(args[1]);
+	entry->incr = (uint32_t)atoi(args[2]);
+	if (!strcmp(args[3], "MUL"))
+		entry->op = OP_MUL;
+	else if (!strcmp(args[3], "ADD"))
+		entry->op = OP_ADD;
+	else {
+		printf("Invalid op %s.\n", args[3]);
+		args_nr = -1;
+	}
+
+	return args_nr;
+}
+
+/*
+ * Fetch an integer-valued entry of a section.
+ * Returns 0 and stores the atoi() conversion in *out, or -1 (after logging)
+ * when the entry is absent.
+ */
+static int
+get_int_entry(struct rte_cfgfile *cfgfile, const char *section_name,
+		const char *entry_name, int *out)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section_name, entry_name);
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", entry_name, section_name);
+		return -1;
+	}
+
+	*out = atoi(str);
+	return 0;
+}
+
+/*
+ * Parse a "N" or "first,last,increment,op" entry of a section into *entry
+ * and count it in *nb_vp when it has more than one field.
+ * Returns 0 on success, -1 when parse_entry() rejects the value.
+ */
+static int
+load_var_entry(struct rte_cfgfile *cfgfile, const char *section_name,
+		const char *entry_name, struct test_configure_entry *entry, int *nb_vp)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section_name, entry_name);
+	int args_nr = parse_entry(str, entry);
+
+	if (args_nr < 0)
+		return -1;
+	if (args_nr > 1)
+		(*nb_vp)++;
+	return 0;
+}
+
+/*
+ * Fill *test_case from the section named section_name (case number case_id).
+ * Returns true when every required entry is present and well-formed.
+ * test_type and test_type_str are set as soon as the type is recognised,
+ * even if a later entry makes the case invalid.
+ */
+static bool
+load_case(struct rte_cfgfile *cfgfile, const char *section_name, int case_id,
+		struct test_configure *test_case)
+{
+	const char *case_type, *lcore_str, *eal_args;
+	int nb_vp = 0;
+	int val;
+	bool is_dma;
+
+	case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+	if (!case_type) {
+		printf("Error: No case type in case %d, the test will be finished here.\n",
+			case_id);
+		return false;
+	}
+
+	if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+		test_case->test_type_str = DMA_MEM_COPY;
+		is_dma = true;
+	} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+		test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+		test_case->test_type_str = CPU_MEM_COPY;
+		is_dma = false;
+	} else {
+		printf("Error: Cannot find case type %s in case%d.\n", case_type, case_id);
+		return false;
+	}
+
+	if (get_int_entry(cfgfile, section_name, "src_numa_node", &val) < 0)
+		return false;
+	test_case->src_numa_node = val;
+
+	if (get_int_entry(cfgfile, section_name, "dst_numa_node", &val) < 0)
+		return false;
+	test_case->dst_numa_node = val;
+
+	if (load_var_entry(cfgfile, section_name, "mem_size",
+				&test_case->mem_size, &nb_vp) < 0 ||
+			load_var_entry(cfgfile, section_name, "buf_size",
+				&test_case->buf_size, &nb_vp) < 0) {
+		printf("parse error in case %d.\n", case_id);
+		return false;
+	}
+
+	if (is_dma) {
+		if (load_var_entry(cfgfile, section_name, "dma_ring_size",
+					&test_case->ring_size, &nb_vp) < 0 ||
+				load_var_entry(cfgfile, section_name, "kick_batch",
+					&test_case->kick_batch, &nb_vp) < 0) {
+			printf("parse error in case %d.\n", case_id);
+			return false;
+		}
+
+		/* A missing key yields NULL, which must not reach the parser. */
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+		if (lcore_str == NULL || parse_lcore_dma(test_case, lcore_str) < 0) {
+			printf("parse lcore dma error in case %d.\n", case_id);
+			return false;
+		}
+	} else {
+		lcore_str = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+		if (lcore_str == NULL || parse_lcore(test_case, lcore_str) < 0) {
+			printf("parse lcore error in case %d.\n", case_id);
+			return false;
+		}
+	}
+
+	if (nb_vp > 1) {
+		printf("Error, each section can only have a single variable parameter.\n");
+		return false;
+	}
+
+	if (get_int_entry(cfgfile, section_name, "cache_flush", &val) < 0)
+		return false;
+	test_case->cache_flush = val;
+
+	if (get_int_entry(cfgfile, section_name, "test_seconds", &val) < 0)
+		return false;
+	test_case->test_secs = (uint16_t)val;
+
+	/*
+	 * The string returned by rte_cfgfile_get_entry() belongs to the
+	 * cfgfile, which is closed before the case runs; keep a private copy.
+	 * The copy is intentionally never freed: test cases live for the whole
+	 * process lifetime.
+	 */
+	eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	test_case->eal_args = NULL;
+	if (eal_args != NULL) {
+		test_case->eal_args = strdup(eal_args);
+		if (test_case->eal_args == NULL) {
+			printf("Error: Cannot store eal_args of case %d.\n", case_id);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Load the configuration file at path into test_cases[].
+ *
+ * Sections are looked up by the names "case1", "case2", ... up to the number
+ * of sections in the file. A case with missing or malformed entries is kept
+ * but marked with is_valid = false. The process exits when the file cannot
+ * be loaded or holds more than MAX_TEST_CASES sections.
+ *
+ * Returns the number of test_cases[] slots processed.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	char section_name[CFG_NAME_LEN];
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_cases[i].is_valid = load_case(cfgfile, section_name, i + 1,
+				&test_cases[i]);
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the EAL argument vector for one test case.
+ *
+ * Copies argv[] into new_argv[], dropping the application's own "--config"
+ * and "--result" options together with their values, then appends the
+ * space-separated tokens of eal_args (which may be NULL).
+ *
+ * new_argv must provide MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes
+ * each. Arguments longer than a buffer are truncated; arguments beyond
+ * MAX_EAL_PARAM_NB are dropped after a message is printed.
+ *
+ * Returns the number of entries written to new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			i++;	/* also skip the option's value */
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB) {
+			printf("Too many EAL arguments, the maximum is %d.\n",
+				MAX_EAL_PARAM_NB);
+			return new_argc;
+		}
+		/*
+		 * Use the buffer size, not sizeof(new_argv[new_argc]): the
+		 * latter is the size of a pointer and would truncate every
+		 * argument to a few characters.
+		 */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			if (new_argc >= MAX_EAL_PARAM_NB) {
+				printf("Too many EAL arguments, the maximum is %d.\n",
+					MAX_EAL_PARAM_NB);
+				break;
+			}
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point.
+ *
+ * Loads the test cases from the file given with "--config", then runs each
+ * valid case in its own forked child process: the child initializes EAL with
+ * the command-line arguments plus the case's eal_args, runs the case and
+ * appends its rows to the result CSV, while the parent waits for the child
+ * and reports how it terminated.
+ *
+ * The result file is the "--result" argument or, by default, is derived from
+ * the config file path by cutting its base name at the first '.' and
+ * appending "_result.csv".
+ */
+int
+main(int argc, char *argv[])
+{
+	static const char rst_suffix[] = "_result.csv";
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last simply yields no path. */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		char *stem;
+
+		/* Make sure the suffix appended below cannot overflow rst_path. */
+		if (strlcpy(rst_path, cfg_path_ptr, PATH_MAX) + sizeof(rst_suffix) > PATH_MAX) {
+			printf("Config file path is too long.\n");
+			return -1;
+		}
+		/* strtok() returns NULL when the base name consists of '.' only. */
+		stem = strtok(basename(rst_path), ".");
+		if (stem == NULL) {
+			printf("Invalid config file name.\n");
+			return -1;
+		}
+		strcat(stem, rst_suffix);
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* Create or truncate the result file; each child re-opens it in append mode. */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %u.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %u.\n\n", i + 1);
+			continue;
+		}
+
+		/* Flush so that buffered output is not emitted again by the child. */
+		fflush(stdout);
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %u failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd)
+				rte_exit(EXIT_FAILURE, "Open output CSV file error.\n");
+
+			if (is_first_case)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/*
+			 * A child only changes its own copy of the flag, so it
+			 * has to be cleared here for the following cases.
+			 */
+			is_first_case = false;
+
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a swept configuration entry advances from one scenario to the next. */
+typedef enum {
+	OP_NONE = 0,	/* single value, no sweep */
+	OP_ADD,		/* "ADD": the increment is added to the current value */
+	OP_MUL		/* "MUL": the current value is multiplied by the increment */
+} alg_op_type;
+
+/*
+ * One numeric configuration parameter, written in the config file either as
+ * a single value or as "first,last,increment,ADD|MUL".
+ */
+struct test_configure_entry {
+	uint32_t first;	/* initial value (the only value when not swept) */
+	uint32_t last;	/* upper bound of the sweep; 0 when not swept */
+	uint32_t incr;	/* step operand; 0 when not swept */
+	alg_op_type op;	/* how "incr" is applied to "cur" */
+	uint32_t cur;	/* value used by the scenario currently running */
+};
+
+/*
+ * Worker lcores of a test case and, for DMA cases, the DMA device paired
+ * with each of them. The arrays are indexed by worker; the first "cnt" slots
+ * are in use.
+ */
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB];	/* lcore ID of each worker */
+	/* device name from "lcore<ID>@<name>" in the "lcore_dma" entry */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
+	/* DMA device ID of each worker; NOTE(review): assigned outside the code shown here */
+	int16_t dma_ids[MAX_WORKER_NB];
+	uint16_t cnt;	/* number of workers */
+};
+
+/* Configuration of one test case, i.e. one "[caseN]" section of the config file. */
+struct test_configure {
+	bool is_valid;	/* false when the section could not be parsed */
+	uint8_t test_type;	/* one of the TEST_TYPE_* values in main.c */
+	const char *test_type_str;	/* the "type" string, used in the CSV header */
+	uint16_t src_numa_node;	/* NUMA node the source memory is allocated on */
+	uint16_t dst_numa_node;	/* NUMA node the destination memory is allocated on */
+	uint16_t opcode;	/* NOTE(review): not referenced in the code visible here */
+	bool is_dma;	/* NOTE(review): not referenced in the code visible here */
+	struct lcore_dma_map_t lcore_dma_map;	/* workers and their DMA devices */
+	struct test_configure_entry mem_size;	/* "mem_size": memory footprint */
+	struct test_configure_entry buf_size;	/* "buf_size": size of a single copy */
+	struct test_configure_entry ring_size;	/* "dma_ring_size" (DMA cases only) */
+	struct test_configure_entry kick_batch;	/* "kick_batch" (DMA cases only) */
+	uint32_t cache_flush;	/* "cache_flush": 1 to flush the cache, 0 not to */
+	uint32_t nr_buf;	/* number of buffers; presumably derived by the benchmark - confirm */
+	uint16_t test_secs;	/* "test_seconds" */
+	const char *eal_args;	/* "eal_args": extra EAL arguments, NULL when absent */
+	uint8_t scenario_id;	/* 1-based index of the scenario being run */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-06-12  8:26     ` Jiang, Cheng1
@ 2023-06-13  4:51       ` Jiang, Cheng1
  2023-06-13  7:34         ` Amit Prakash Shukla
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-13  4:51 UTC (permalink / raw)
  To: Amit Prakash Shukla, thomas, Richardson, Bruce, mb, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Jerin Jacob Kollanukkaran, Anoob Joseph

Hi,

Replies are inline.

> -----Original Message-----
> From: Jiang, Cheng1
> Sent: Monday, June 12, 2023 4:27 PM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>;
> thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <Chenbo.Xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> <Xuan.Ding@intel.com>; Ma, WenwuX <WenwuX.Ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Thanks for your comments, replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Amit Prakash Shukla <amitprakashs@marvell.com>
> > Sent: Friday, June 9, 2023 10:03 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>; Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> > <anoobj@marvell.com>
> > Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > application
> >
> >
> >
> > > -----Original Message-----
> > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > Sent: Thursday, June 8, 2023 2:14 PM
> > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > mb@smartsharesystems.com; chenbo.xia@intel.com
> > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> xingguang.he@intel.com;
> > > Cheng Jiang <cheng1.jiang@intel.com>
> > > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > External Email
> > >
> > > --------------------------------------------------------------------
> > > -- There are many high-performance DMA devices supported in DPDK
> > > now,
> > and
> > > these DMA devices can also be integrated into other modules of DPDK
> > > as accelerators, such as Vhost. Before integrating DMA into
> > > applications, developers need to know the performance of these DMA
> > > devices in various scenarios and the performance of CPUs in the same
> > > scenario, such as different buffer lengths. Only in this way can we
> > > know the target performance of the application accelerated by using
> > > them. This patch introduces a high-performance testing tool, which
> > > supports comparing the performance of CPU and DMA in different
> > > scenarios automatically with a pre- set config file. Memory Copy
> > > performance test
> > are supported for now.
> > >
> > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > ---
> > > v5:
> > >   fixed some LONG_LINE warnings;
> > > v4:
> > >   fixed inaccuracy of the memory footprint display;
> > > v3:
> > >   fixed some typos;
> > > v2:
> > >   added lcore/dmadev designation;
> > >   added error case process;
> > >   removed worker_threads parameter from config.ini;
> > >   improved the logs;
> > >   improved config file;
> > >
> > >  app/meson.build               |   1 +
> > >  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> > > app/test-dma-perf/config.ini  |  59 ++++
> > >  app/test-dma-perf/main.c      | 569
> > > ++++++++++++++++++++++++++++++++++
> > >  app/test-dma-perf/main.h      |  69 +++++
> > >  app/test-dma-perf/meson.build |  17 +
> > >  6 files changed, 1187 insertions(+)  create mode 100644
> > > app/test-dma-perf/benchmark.c  create mode
> > 100644
> > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > mode
> > > 100644 app/test-dma-perf/meson.build
> > >
> >
> > <snip>
> >
> > > +
> > > +static inline int
> > > +do_dma_mem_copy(void *p)
> > > +{
> > > +	uint16_t *para_idx = (uint16_t *)p;
> > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > +	uint16_t dev_id = para->dev_id;
> > > +	uint32_t nr_buf = para->nr_buf;
> > > +	uint16_t kick_batch = para->kick_batch;
> > > +	uint32_t buf_size = para->buf_size;
> > > +	struct rte_mbuf **srcs = para->srcs;
> > > +	struct rte_mbuf **dsts = para->dsts;
> > > +	int64_t async_cnt = 0;
> > > +	int nr_cpl = 0;
> > > +	uint32_t i;
> > > +	uint32_t poll_cnt = 0;
> > > +
> > > +	worker_info->stop_flag = false;
> > > +	worker_info->ready_flag = true;
> > > +
> > > +	while (!worker_info->start_flag)
> > > +		;
> > > +
> > > +	while (1) {
> > > +		for (i = 0; i < nr_buf; i++) {
> > > +			if (unlikely(rte_dma_copy(dev_id,
> > > +						0,
> > > +						rte_pktmbuf_iova(srcs[i]),
> > > +						rte_pktmbuf_iova(dsts[i]),
> > > +						buf_size,
> > > +						0) < 0)) {
> > > +				rte_dma_submit(dev_id, 0);
> > > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > > 0) {
> > > +					nr_cpl = rte_dma_completed(dev_id,
> > > 0, MAX_DMA_CPL_NB,
> > > +								NULL, NULL);
> > > +					async_cnt -= nr_cpl;
> > > +					worker_info->total_cpl += nr_cpl;
> > > +				}
> > > +				if (rte_dma_copy(dev_id,
> > > +						0,
> > > +						rte_pktmbuf_iova(srcs[i]),
> > > +						rte_pktmbuf_iova(dsts[i]),
> > > +						buf_size,
> > > +						0) < 0) {
> > > +					printf("enqueue fail again at %u\n",
> > > i);
> > > +					printf("space:%d\n",
> > > rte_dma_burst_capacity(dev_id, 0));
> > > +					rte_exit(EXIT_FAILURE, "DMA
> > > enqueue failed\n");
> >
> > [Amit]: On all success or failure exits, please call rte_dma_stop and
> > rte_dma_close to exit cleanly.
> 
> [Cheng] Got it. Thanks, I'll fix it in the next version.

[Cheng] Hi, I took a look into it and found that rte_exit() will call rte_dma_close(), so I think there is no need to do it explicitly here. What do you think?

> 
> >
> > > +				}
> > > +			}
> > > +			async_cnt++;
> > > +
> > > +			if ((async_cnt % kick_batch) == 0) {
> > > +				rte_dma_submit(dev_id, 0);
> > > +				/* add a poll to avoid ring full */
> > > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > +				async_cnt -= nr_cpl;
> > > +				worker_info->total_cpl += nr_cpl;
> > > +			}
> > > +		}
> > > +
> > > +		if (worker_info->stop_flag)
> > > +			break;
> > > +	}
> > > +
> > > +	rte_dma_submit(dev_id, 0);
> > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > +		async_cnt -= nr_cpl;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> >
> > <snip>
> >
> > > +
> > > +void
> > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > +	uint16_t i;
> > > +	uint32_t offset;
> > > +	unsigned int lcore_id = 0;
> > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024)
> > > +/
> > > (cfg->buf_size.cur * 2);
> > > +	uint16_t nb_workers = ldm->cnt;
> > > +	uint16_t test_secs = cfg->test_secs;
> > > +	float memory;
> > > +	uint32_t avg_cycles = 0;
> > > +	float mops;
> > > +	float bandwidth;
> > > +
> > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > +		goto out;
> > > +
> > > +	if (is_dma)
> > > +		if (config_dmadevs(cfg) < 0)
> > > +			goto out;
> > > +
> > > +	if (cfg->cache_flush) {
> > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > +		rte_mb();
> > > +	}
> > > +
> > > +	printf("Start testing....\n");
> > > +
> > > +	for (i = 0; i < nb_workers; i++) {
> > > +		lcore_id = ldm->lcores[i];
> > > +		offset = nr_buf / nb_workers * i;
> > > +
> > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > lcore_params), 0);
> > > +		if (!worker_params[i]) {
> > > +			printf("lcore parameters malloc failure for lcore
> > > %d\n", lcore_id);
> > > +			break;
> > > +		}
> > > +		if (is_dma) {
> > > +			worker_params[i]->dma_name = ldm-
> > > >dma_names[i];
> > > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > > +			worker_params[i]->kick_batch = kick_batch;
> > > +		}
> > > +		worker_params[i]->worker_id = i;
> > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > nb_workers);
> > > +		worker_params[i]->buf_size = buf_size;
> > > +		worker_params[i]->test_secs = test_secs;
> > > +		worker_params[i]->srcs = srcs + offset;
> > > +		worker_params[i]->dsts = dsts + offset;
> > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > +		worker_params[i]->lcore_id = lcore_id;
> > > +
> > > +		if (is_dma)
> > > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > > *)(&i), lcore_id);
> > > +		else
> > > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > > *)(&i), lcore_id);
> > > +	}
> > > +
> > > +	while (1) {
> > > +		bool ready = true;
> > > +		for (i = 0; i < nb_workers; i++) {
> > > +			if (worker_params[i]->worker_info.ready_flag ==
> > > false) {
> > > +				ready = 0;
> > > +				break;
> > > +			}
> > > +		}
> > > +		if (ready)
> > > +			break;
> > > +	}
> > > +
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.start_flag = true;
> > > +
> > > +	usleep(TEST_WAIT_U_SECOND);
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.test_cpl =
> > > +worker_params[i]->worker_info.total_cpl;
> > > +
> > > +	usleep(test_secs * 1000 * 1000);
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.test_cpl =
> > > worker_params[i]->worker_info.total_cpl -
> > > +						worker_params[i]-
> > > >worker_info.test_cpl;
> > > +
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.stop_flag = true;
> > > +
> > > +	rte_eal_mp_wait_lcore();
> > > +
> > > +	for (i = 0; i < nb_workers; i++) {
> > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > +			worker_params[i]->worker_info.test_cpl,
> > > +			&memory, &avg_cycles, &bandwidth, &mops);
> > > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > > +					worker_params[i]->dma_name,
> > > avg_cycles, buf_size,
> > > +					nr_buf / nb_workers, memory,
> > > bandwidth, mops, is_dma);
> > > +	}
> > > +
> > > +out:
> > > +	/* free env */
> > > +	if (srcs)
> > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > +	if (dsts)
> > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > +
> > > +	if (src_pool)
> > > +		rte_mempool_free(src_pool);
> > > +	if (dst_pool)
> > > +		rte_mempool_free(dst_pool);
> > > +
> > > +	if (is_dma) {
> > > +		for (i = 0; i < nb_workers; i++) {
> > > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > > +			rte_dma_stop(ldm->dma_ids[i]);
> >
> > [Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.
> >
> 
> [Cheng] Sure, I'll fix it in the next version.

[Cheng] We are not planning to exit here; we are just going on to the next test, so we shouldn't call rte_dma_close(). Just to clarify: when we have finished all the tests, we will call rte_eal_cleanup(), which will call rte_dma_close(). Thanks!

> 
> > <snip>
> >
> > > +#endif /* _MAIN_H_ */
> > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/meson.build
> > > @@ -0,0 +1,17 @@
> > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > +Intel Corporation
> > > +
> > > +# meson file, for building this app as part of a main DPDK build.
> > > +
> > > +if is_windows
> > > +    build = false
> > > +    reason = 'not supported on Windows'
> > > +    subdir_done()
> > > +endif
> > > +
> > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > +
> > > +sources = files(
> > > +        'main.c',
> > > +        'benchmark.c',
> > > +)
> > > --
> > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
  2023-06-13  4:51       ` Jiang, Cheng1
@ 2023-06-13  7:34         ` Amit Prakash Shukla
  0 siblings, 0 replies; 53+ messages in thread
From: Amit Prakash Shukla @ 2023-06-13  7:34 UTC (permalink / raw)
  To: Jiang, Cheng1, thomas, Richardson, Bruce, mb, Xia, Chenbo
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Jerin Jacob Kollanukkaran, Anoob Joseph

Hi Cheng,

My replies are inline.

Thanks,
Amit  Shukla

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Tuesday, June 13, 2023 10:21 AM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>;
> thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Replies are inline.
> 
> > -----Original Message-----
> > From: Jiang, Cheng1
> > Sent: Monday, June 12, 2023 4:27 PM
> > To: Amit Prakash Shukla <amitprakashs@marvell.com>;
> > thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <Chenbo.Xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> > <Xuan.Ding@intel.com>; Ma, WenwuX <WenwuX.Ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> > <anoobj@marvell.com>
> > Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi,
> >
> > Thanks for your comments, replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Amit Prakash Shukla <amitprakashs@marvell.com>
> > > Sent: Friday, June 9, 2023 10:03 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> > YuanX
> > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> > > Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> > > <anoobj@marvell.com>
> > > Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > > application
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Sent: Thursday, June 8, 2023 2:14 PM
> > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > mb@smartsharesystems.com; chenbo.xia@intel.com
> > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > xingguang.he@intel.com;
> > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > External Email
> > > >
> > > > ------------------------------------------------------------------
> > > > --
> > > > -- There are many high-performance DMA devices supported in DPDK
> > > > now,
> > > and
> > > > these DMA devices can also be integrated into other modules of
> > > > DPDK as accelerators, such as Vhost. Before integrating DMA into
> > > > applications, developers need to know the performance of these DMA
> > > > devices in various scenarios and the performance of CPUs in the
> > > > same scenario, such as different buffer lengths. Only in this way
> > > > can we know the target performance of the application accelerated
> > > > by using them. This patch introduces a high-performance testing
> > > > tool, which supports comparing the performance of CPU and DMA in
> > > > different scenarios automatically with a pre- set config file.
> > > > Memory Copy performance test
> > > are supported for now.
> > > >
> > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > ---
> > > > v5:
> > > >   fixed some LONG_LINE warnings;
> > > > v4:
> > > >   fixed inaccuracy of the memory footprint display;
> > > > v3:
> > > >   fixed some typos;
> > > > v2:
> > > >   added lcore/dmadev designation;
> > > >   added error case process;
> > > >   removed worker_threads parameter from config.ini;
> > > >   improved the logs;
> > > >   improved config file;
> > > >
> > > >  app/meson.build               |   1 +
> > > >  app/test-dma-perf/benchmark.c | 472
> ++++++++++++++++++++++++++++
> > > > app/test-dma-perf/config.ini  |  59 ++++
> > > >  app/test-dma-perf/main.c      | 569
> > > > ++++++++++++++++++++++++++++++++++
> > > >  app/test-dma-perf/main.h      |  69 +++++
> > > >  app/test-dma-perf/meson.build |  17 +
> > > >  6 files changed, 1187 insertions(+)  create mode 100644
> > > > app/test-dma-perf/benchmark.c  create mode
> > > 100644
> > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > > mode
> > > > 100644 app/test-dma-perf/meson.build
> > > >
> > >
> > > <snip>
> > >
> > > > +
> > > > +static inline int
> > > > +do_dma_mem_copy(void *p)
> > > > +{
> > > > +	uint16_t *para_idx = (uint16_t *)p;
> > > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > > +	uint16_t dev_id = para->dev_id;
> > > > +	uint32_t nr_buf = para->nr_buf;
> > > > +	uint16_t kick_batch = para->kick_batch;
> > > > +	uint32_t buf_size = para->buf_size;
> > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > +	int64_t async_cnt = 0;
> > > > +	int nr_cpl = 0;
> > > > +	uint32_t i;
> > > > +	uint32_t poll_cnt = 0;
> > > > +
> > > > +	worker_info->stop_flag = false;
> > > > +	worker_info->ready_flag = true;
> > > > +
> > > > +	while (!worker_info->start_flag)
> > > > +		;
> > > > +
> > > > +	while (1) {
> > > > +		for (i = 0; i < nr_buf; i++) {
> > > > +			if (unlikely(rte_dma_copy(dev_id,
> > > > +						0,
> > > > +						rte_pktmbuf_iova(srcs[i]),
> > > > +						rte_pktmbuf_iova(dsts[i]),
> > > > +						buf_size,
> > > > +						0) < 0)) {
> > > > +				rte_dma_submit(dev_id, 0);
> > > > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > > > 0) {
> > > > +					nr_cpl = rte_dma_completed(dev_id,
> > > > 0, MAX_DMA_CPL_NB,
> > > > +								NULL, NULL);
> > > > +					async_cnt -= nr_cpl;
> > > > +					worker_info->total_cpl += nr_cpl;
> > > > +				}
> > > > +				if (rte_dma_copy(dev_id,
> > > > +						0,
> > > > +						rte_pktmbuf_iova(srcs[i]),
> > > > +						rte_pktmbuf_iova(dsts[i]),
> > > > +						buf_size,
> > > > +						0) < 0) {
> > > > +					printf("enqueue fail again at %u\n",
> > > > i);
> > > > +					printf("space:%d\n",
> > > > rte_dma_burst_capacity(dev_id, 0));
> > > > +					rte_exit(EXIT_FAILURE, "DMA
> > > > enqueue failed\n");
> > >
> > > [Amit]: On all success or failure exits, please call rte_dma_stop
> > > and rte_dma_close to exit cleanly.
> >
> > [Cheng] Got it. Thanks, I'll fix it in the next version.
> 
> [Cheng] Hi, I take a look into it, and find out rte_exit() will call
> rte_dma_close(), so I think there is no need to do it explicitly here. What do
> you think?

[Amit]: I agree, not required to explicitly call rte_dma_close here.

> 
> >
> > >
> > > > +				}
> > > > +			}
> > > > +			async_cnt++;
> > > > +
> > > > +			if ((async_cnt % kick_batch) == 0) {
> > > > +				rte_dma_submit(dev_id, 0);
> > > > +				/* add a poll to avoid ring full */
> > > > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +				async_cnt -= nr_cpl;
> > > > +				worker_info->total_cpl += nr_cpl;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		if (worker_info->stop_flag)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	rte_dma_submit(dev_id, 0);
> > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +		async_cnt -= nr_cpl;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > >
> > > <snip>
> > >
> > > > +
> > > > +void
> > > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > > +	uint16_t i;
> > > > +	uint32_t offset;
> > > > +	unsigned int lcore_id = 0;
> > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > +1024) /
> > > > (cfg->buf_size.cur * 2);
> > > > +	uint16_t nb_workers = ldm->cnt;
> > > > +	uint16_t test_secs = cfg->test_secs;
> > > > +	float memory;
> > > > +	uint32_t avg_cycles = 0;
> > > > +	float mops;
> > > > +	float bandwidth;
> > > > +
> > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > +		goto out;
> > > > +
> > > > +	if (is_dma)
> > > > +		if (config_dmadevs(cfg) < 0)
> > > > +			goto out;
> > > > +
> > > > +	if (cfg->cache_flush) {
> > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > +		rte_mb();
> > > > +	}
> > > > +
> > > > +	printf("Start testing....\n");
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++) {
> > > > +		lcore_id = ldm->lcores[i];
> > > > +		offset = nr_buf / nb_workers * i;
> > > > +
> > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > lcore_params), 0);
> > > > +		if (!worker_params[i]) {
> > > > +			printf("lcore parameters malloc failure for lcore
> > > > %d\n", lcore_id);
> > > > +			break;
> > > > +		}
> > > > +		if (is_dma) {
> > > > +			worker_params[i]->dma_name = ldm-
> > > > >dma_names[i];
> > > > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > +		}
> > > > +		worker_params[i]->worker_id = i;
> > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > nb_workers);
> > > > +		worker_params[i]->buf_size = buf_size;
> > > > +		worker_params[i]->test_secs = test_secs;
> > > > +		worker_params[i]->srcs = srcs + offset;
> > > > +		worker_params[i]->dsts = dsts + offset;
> > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > +
> > > > +		if (is_dma)
> > > > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > *)(&i), lcore_id);
> > > > +		else
> > > > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > > > *)(&i), lcore_id);
> > > > +	}
> > > > +
> > > > +	while (1) {
> > > > +		bool ready = true;
> > > > +		for (i = 0; i < nb_workers; i++) {
> > > > +			if (worker_params[i]->worker_info.ready_flag ==
> > > > false) {
> > > > +				ready = 0;
> > > > +				break;
> > > > +			}
> > > > +		}
> > > > +		if (ready)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > +
> > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.test_cpl =
> > > > +worker_params[i]->worker_info.total_cpl;
> > > > +
> > > > +	usleep(test_secs * 1000 * 1000);
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.test_cpl =
> > > > worker_params[i]->worker_info.total_cpl -
> > > > +						worker_params[i]-
> > > > >worker_info.test_cpl;
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > +
> > > > +	rte_eal_mp_wait_lcore();
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++) {
> > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > +			worker_params[i]->worker_info.test_cpl,
> > > > +			&memory, &avg_cycles, &bandwidth, &mops);
> > > > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > > > +					worker_params[i]->dma_name,
> > > > avg_cycles, buf_size,
> > > > +					nr_buf / nb_workers, memory,
> > > > bandwidth, mops, is_dma);
> > > > +	}
> > > > +
> > > > +out:
> > > > +	/* free env */
> > > > +	if (srcs)
> > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > +	if (dsts)
> > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > +
> > > > +	if (src_pool)
> > > > +		rte_mempool_free(src_pool);
> > > > +	if (dst_pool)
> > > > +		rte_mempool_free(dst_pool);
> > > > +
> > > > +	if (is_dma) {
> > > > +		for (i = 0; i < nb_workers; i++) {
> > > > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > >
> > > [Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.
> > >
> >
> > [Cheng] Sure, I'll fix it in the next version.
> 
> [Cheng] We are not planning to exit here, we are just going to do the next
> test, so we shouldn't call the rte_dma_close(). Just clarify. And when we
> finished all the test, we will call the rte_eal_cleanup() which will call the
> rte_dma_close(). Thanks!

[Amit]: Sure, thanks.

> 
> >
> > > <snip>
> > >
> > > > +#endif /* _MAIN_H_ */
> > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/meson.build
> > > > @@ -0,0 +1,17 @@
> > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > > +Intel Corporation
> > > > +
> > > > +# meson file, for building this app as part of a main DPDK build.
> > > > +
> > > > +if is_windows
> > > > +    build = false
> > > > +    reason = 'not supported on Windows'
> > > > +    subdir_done()
> > > > +endif
> > > > +
> > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > +
> > > > +sources = files(
> > > > +        'main.c',
> > > > +        'benchmark.c',
> > > > +)
> > > > --
> > > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-13  4:31 ` [PATCH v6] " Cheng Jiang
@ 2023-06-13 12:55   ` huangdengdui
  2023-06-14  6:40     ` Jiang, Cheng1
  2023-06-15  5:21   ` [EXT] " Anoob Joseph
  1 sibling, 1 reply; 53+ messages in thread
From: huangdengdui @ 2023-06-13 12:55 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia,
	amitprakashs, anoobj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he

Hi Cheng,

A few comments inline. Please check.

Thanks,
Dengdui

On 2023/6/13 12:31, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v6:
>   improved code based on Anoob's comments;
>   fixed some code structure issues;
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 477 ++++++++++++++++++++++++++++
>  app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 569 ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1192 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c
>  create mode 100644 app/test-dma-perf/config.ini
>  create mode 100644 app/test-dma-perf/main.c
>  create mode 100644 app/test-dma-perf/main.h
>  create mode 100644 app/test-dma-perf/meson.build
> 
> diff --git a/app/meson.build b/app/meson.build
> index 74d2420f67..4fc1a83eba 100644
> --- a/app/meson.build
> +++ b/app/meson.build
> @@ -19,6 +19,7 @@ apps = [
>          'test-cmdline',
>          'test-compress-perf',
>          'test-crypto-perf',
> +        'test-dma-perf',
>          'test-eventdev',
>          'test-fib',
>          'test-flow-perf',
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> new file mode 100644
> index 0000000000..bc1ca82297
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.c
> @@ -0,0 +1,477 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Intel Corporation
> + */
> +

<snip>

> +static inline int
> +__rte_format_printf(3, 4)
> +print_err(const char *func, int lineno, const char *format, ...)
> +{
> +	va_list ap;
> +	int ret;
> +
> +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> +	va_start(ap, format);
> +	ret += vfprintf(stderr, format, ap);
> +	va_end(ap);
> +
> +	return ret;
> +}
> +
> +static inline void
> +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
> +				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
> +				float *bandwidth, float *mops)
> +{
> +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
> +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() / (float)*ave_cycle)) / 1000000000;
> +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000;

The value of ave_cycle may be 0.

*mops = (float)(total_cnt / test_secs) / 1000000; ?

> +}
> +
> +static void
> +output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
> +			uint32_t buf_size, uint32_t nr_buf, float memory,
> +			float bandwidth, float mops, bool is_dma)
> +{
> +	if (is_dma)
> +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> +	else
> +		printf("lcore %u\n", lcore_id);
> +
> +	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> +			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
> +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
> +
> +	if (is_dma)
> +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
> +			scenario_id, lcore_id, dma_name, buf_size,
> +			nr_buf, memory, ave_cycle, bandwidth, mops);
> +	else
> +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
> +			scenario_id, lcore_id, buf_size,
> +			nr_buf, memory, ave_cycle, bandwidth, mops);
> +}
> +
> +static inline void
> +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> +		__maybe_unused uint32_t buf_size,
> +		__maybe_unused uint32_t nr_buf)
> +{
> +#ifdef RTE_ARCH_X86_64
> +	char *data;
> +	struct rte_mbuf **srcs = array;
> +	uint32_t i, offset;
> +
> +	for (i = 0; i < nr_buf; i++) {
> +		data = rte_pktmbuf_mtod(srcs[i], char *);
> +		for (offset = 0; offset < buf_size; offset += 64)
> +			__builtin_ia32_clflush(data + offset);
> +	}
> +#endif
> +}
> +

<snip>

> +
> +/* Parse the argument given in the command line of the application */
> +static int
> +append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
> +{
> +	int i;
> +	char *tokens[MAX_EAL_PARAM_NB];
> +	char args[MAX_EAL_PARAM_LEN] = {0};
> +	int token_nb, new_argc = 0;
> +
> +	for (i = 0; i < argc; i++) {
> +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> +				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
> +			i++;
> +			continue;
> +		}
> +		strlcpy(new_argv[new_argc], argv[i], sizeof(new_argv[new_argc]));

The type of new_argv[new_argc] is char *, so sizeof() gives the size of the pointer, not of the buffer. Cannot use sizeof() here.

strcpy(new_argv[new_argc], argv[i]); or strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN); ?

> +		new_argc++;
> +	}
> +
> +	if (eal_args) {
> +		strlcpy(args, eal_args, sizeof(args));
> +		token_nb = rte_strsplit(args, strlen(args),
> +					tokens, MAX_EAL_PARAM_NB, ' ');
> +		for (i = 0; i < token_nb; i++)
> +			strcpy(new_argv[new_argc++], tokens[i]);
> +	}
> +
> +	return new_argc;
> +}
> +
> +int
> +main(int argc, char *argv[])
> +{
> +	int ret;
> +	uint16_t case_nb;
> +	uint32_t i, nb_lcores;
> +	pid_t cpid, wpid;
> +	int wstatus;
> +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> +	char *pargs[MAX_EAL_PARAM_NB];
> +	char *cfg_path_ptr = NULL;
> +	char *rst_path_ptr = NULL;
> +	char rst_path[PATH_MAX];
> +	int new_argc;
> +	bool is_first_case = true;
> +
> +	memset(args, 0, sizeof(args));
> +
> +	for (i = 0; i < RTE_DIM(pargs); i++)
> +		pargs[i] = args[i];
> +
> +	for (i = 0; i < (uint32_t)argc; i++) {
> +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
> +			cfg_path_ptr = argv[i + 1];
> +		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
> +			rst_path_ptr = argv[i + 1];
> +	}
> +	if (cfg_path_ptr == NULL) {
> +		printf("Config file not assigned.\n");
> +		return -1;
> +	}
> +	if (rst_path_ptr == NULL) {
> +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> +		strcat(strtok(basename(rst_path), "."), "_result.csv");
> +		rst_path_ptr = rst_path;
> +	}
> +
> +	case_nb = load_configs(cfg_path_ptr);
> +	fd = fopen(rst_path_ptr, "w");
> +	if (fd == NULL) {
> +		printf("Open output CSV file error.\n");
> +		return -1;
> +	}
> +	fclose(fd);
> +
> +	for (i = 0; i < case_nb; i++) {
> +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> +			printf("No test type in test case %d.\n\n", i + 1);
> +			continue;
> +		}
> +		if (!test_cases[i].is_valid) {
> +			printf("Invalid test case %d.\n\n", i + 1);
> +			continue;
> +		}
> +
> +		cpid = fork();
> +		if (cpid < 0) {
> +			printf("Fork case %d failed.\n", i + 1);
> +			exit(EXIT_FAILURE);
> +		} else if (cpid == 0) {
> +			printf("\nRunning case %u\n\n", i + 1);
> +
> +			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
> +			ret = rte_eal_init(new_argc, pargs);
> +			if (ret < 0)
> +				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
> +
> +			/* Check lcores. */
> +			nb_lcores = rte_lcore_count();
> +			if (nb_lcores < 2)
> +				rte_exit(EXIT_FAILURE,
> +					"There should be at least 2 worker lcores.\n");
> +
> +			fd = fopen(rst_path_ptr, "a");
> +			if (!fd) {
> +				printf("Open output CSV file error.\n");
> +				return 0;
> +			}
> +
> +			if (is_first_case) {
> +				output_env_info();
> +				is_first_case = false;
> +			}
> +			run_test(i + 1, &test_cases[i]);
> +
> +			/* clean up the EAL */
> +			rte_eal_cleanup();
> +
> +			fclose(fd);
> +
> +			printf("\nCase %u completed.\n\n", i + 1);
> +
> +			exit(EXIT_SUCCESS);
> +		} else {
> +			wpid = waitpid(cpid, &wstatus, 0);
> +			if (wpid == -1) {
> +				printf("waitpid error.\n");
> +				exit(EXIT_FAILURE);
> +			}
> +
> +			if (WIFEXITED(wstatus))
> +				printf("Case process exited. status %d\n\n",
> +					WEXITSTATUS(wstatus));
> +			else if (WIFSIGNALED(wstatus))
> +				printf("Case process killed by signal %d\n\n",
> +					WTERMSIG(wstatus));
> +			else if (WIFSTOPPED(wstatus))
> +				printf("Case process stopped by signal %d\n\n",
> +					WSTOPSIG(wstatus));
> +			else if (WIFCONTINUED(wstatus))
> +				printf("Case process continued.\n\n");
> +			else
> +				printf("Case process unknown terminated.\n\n");
> +		}
> +	}
> +
> +	printf("Bye...\n");
> +	return 0;
> +}

<snip>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-13 12:55   ` huangdengdui
@ 2023-06-14  6:40     ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-14  6:40 UTC (permalink / raw)
  To: huangdengdui, thomas, Richardson, Bruce, mb, Xia, Chenbo,
	amitprakashs, anoobj
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang

Hi,

Thanks for your comments, replies are inline.

Thanks again,
Cheng

> -----Original Message-----
> From: huangdengdui <huangdengdui@huawei.com>
> Sent: Tuesday, June 13, 2023 8:55 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> amitprakashs@marvell.com; anoobj@marvell.com
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Few comments inline. Please check.
> 
> Thanks,
> Dengdui
> 
> On 2023/6/13 12:31, Cheng Jiang wrote:
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre-set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v6:
> >   improved code based on Anoob's comments;
> >   fixed some code structure issues;
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 477 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 569
> ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1192 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode
> 100644
> > app/test-dma-perf/config.ini  create mode 100644
> > app/test-dma-perf/main.c  create mode 100644 app/test-dma-perf/main.h
> > create mode 100644 app/test-dma-perf/meson.build
> >
> > diff --git a/app/meson.build b/app/meson.build index
> > 74d2420f67..4fc1a83eba 100644
> > --- a/app/meson.build
> > +++ b/app/meson.build
> > @@ -19,6 +19,7 @@ apps = [
> >          'test-cmdline',
> >          'test-compress-perf',
> >          'test-crypto-perf',
> > +        'test-dma-perf',
> >          'test-eventdev',
> >          'test-fib',
> >          'test-flow-perf',
> > diff --git a/app/test-dma-perf/benchmark.c
> > b/app/test-dma-perf/benchmark.c new file mode 100644 index
> > 0000000000..bc1ca82297
> > --- /dev/null
> > +++ b/app/test-dma-perf/benchmark.c
> > @@ -0,0 +1,477 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2023 Intel Corporation  */
> > +
> 
> <snip>
> 
> > +static inline int
> > +__rte_format_printf(3, 4)
> > +print_err(const char *func, int lineno, const char *format, ...) {
> > +	va_list ap;
> > +	int ret;
> > +
> > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > +	va_start(ap, format);
> > +	ret += vfprintf(stderr, format, ap);
> > +	va_end(ap);
> > +
> > +	return ret;
> > +}
> > +
> > +static inline void
> > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers,
> uint16_t test_secs,
> > +				uint32_t total_cnt, float *memory, uint32_t
> *ave_cycle,
> > +				float *bandwidth, float *mops)
> > +{
> > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 *
> 1024);
> > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() / (float)*ave_cycle))
> / 1000000000;
> > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000;
> 
> The value of ave_cycle may be 0.
> 
> *mops = (float)(total_cnt / test_secs) / 1000000; ?

OK, it makes sense to me. I'll fix it in the next version.

> 
> > +}
> > +
> > +static void
> > +output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
> uint64_t ave_cycle,
> > +			uint32_t buf_size, uint32_t nr_buf, float memory,
> > +			float bandwidth, float mops, bool is_dma) {
> > +	if (is_dma)
> > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > +	else
> > +		printf("lcore %u\n", lcore_id);
> > +
> > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u,
> memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > +			ave_cycle, buf_size, nr_buf, memory,
> rte_get_timer_hz());
> > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth,
> > +mops);
> > +
> > +	if (is_dma)
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> CSV_LINE_DMA_FMT,
> > +			scenario_id, lcore_id, dma_name, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, mops);
> > +	else
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> CSV_LINE_CPU_FMT,
> > +			scenario_id, lcore_id, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, mops); }
> > +
> > +static inline void
> > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > +		__maybe_unused uint32_t buf_size,
> > +		__maybe_unused uint32_t nr_buf)
> > +{
> > +#ifdef RTE_ARCH_X86_64
> > +	char *data;
> > +	struct rte_mbuf **srcs = array;
> > +	uint32_t i, offset;
> > +
> > +	for (i = 0; i < nr_buf; i++) {
> > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > +		for (offset = 0; offset < buf_size; offset += 64)
> > +			__builtin_ia32_clflush(data + offset);
> > +	}
> > +#endif
> > +}
> > +
> 
> <snip>
> 
> > +
> > +/* Parse the argument given in the command line of the application */
> > +static int append_eal_args(int argc, char **argv, const char
> > +*eal_args, char **new_argv) {
> > +	int i;
> > +	char *tokens[MAX_EAL_PARAM_NB];
> > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > +	int token_nb, new_argc = 0;
> > +
> > +	for (i = 0; i < argc; i++) {
> > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > +				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0))
> {
> > +			i++;
> > +			continue;
> > +		}
> > +		strlcpy(new_argv[new_argc], argv[i],
> sizeof(new_argv[new_argc]));
> 
> The type_of argv[new_argc] is *char. Cannot use sizeof().
> 
> strcpy(new_argv[new_argc], argv[i]); or strlcpy(new_argv[new_argc], argv[i],
> MAX_EAL_PARAM_LEN); ?

Yes, it's a mistake, thanks for pointing out! I'll fix it in the next version.

> 
> > +		new_argc++;
> > +	}
> > +
> > +	if (eal_args) {
> > +		strlcpy(args, eal_args, sizeof(args));
> > +		token_nb = rte_strsplit(args, strlen(args),
> > +					tokens, MAX_EAL_PARAM_NB, ' ');
> > +		for (i = 0; i < token_nb; i++)
> > +			strcpy(new_argv[new_argc++], tokens[i]);
> > +	}
> > +
> > +	return new_argc;
> > +}
> > +
> > +int
> > +main(int argc, char *argv[])
> > +{
> > +	int ret;
> > +	uint16_t case_nb;
> > +	uint32_t i, nb_lcores;
> > +	pid_t cpid, wpid;
> > +	int wstatus;
> > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > +	char *pargs[MAX_EAL_PARAM_NB];
> > +	char *cfg_path_ptr = NULL;
> > +	char *rst_path_ptr = NULL;
> > +	char rst_path[PATH_MAX];
> > +	int new_argc;
> > +	bool is_first_case = true;
> > +
> > +	memset(args, 0, sizeof(args));
> > +
> > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > +		pargs[i] = args[i];
> > +
> > +	for (i = 0; i < (uint32_t)argc; i++) {
> > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> MAX_LONG_OPT_SZ) == 0)
> > +			cfg_path_ptr = argv[i + 1];
> > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> MAX_LONG_OPT_SZ) == 0)
> > +			rst_path_ptr = argv[i + 1];
> > +	}
> > +	if (cfg_path_ptr == NULL) {
> > +		printf("Config file not assigned.\n");
> > +		return -1;
> > +	}
> > +	if (rst_path_ptr == NULL) {
> > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > +		strcat(strtok(basename(rst_path), "."), "_result.csv");
> > +		rst_path_ptr = rst_path;
> > +	}
> > +
> > +	case_nb = load_configs(cfg_path_ptr);
> > +	fd = fopen(rst_path_ptr, "w");
> > +	if (fd == NULL) {
> > +		printf("Open output CSV file error.\n");
> > +		return -1;
> > +	}
> > +	fclose(fd);
> > +
> > +	for (i = 0; i < case_nb; i++) {
> > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > +			printf("No test type in test case %d.\n\n", i + 1);
> > +			continue;
> > +		}
> > +		if (!test_cases[i].is_valid) {
> > +			printf("Invalid test case %d.\n\n", i + 1);
> > +			continue;
> > +		}
> > +
> > +		cpid = fork();
> > +		if (cpid < 0) {
> > +			printf("Fork case %d failed.\n", i + 1);
> > +			exit(EXIT_FAILURE);
> > +		} else if (cpid == 0) {
> > +			printf("\nRunning case %u\n\n", i + 1);
> > +
> > +			new_argc = append_eal_args(argc, argv,
> test_cases[i].eal_args, pargs);
> > +			ret = rte_eal_init(new_argc, pargs);
> > +			if (ret < 0)
> > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> arguments\n");
> > +
> > +			/* Check lcores. */
> > +			nb_lcores = rte_lcore_count();
> > +			if (nb_lcores < 2)
> > +				rte_exit(EXIT_FAILURE,
> > +					"There should be at least 2 worker
> lcores.\n");
> > +
> > +			fd = fopen(rst_path_ptr, "a");
> > +			if (!fd) {
> > +				printf("Open output CSV file error.\n");
> > +				return 0;
> > +			}
> > +
> > +			if (is_first_case) {
> > +				output_env_info();
> > +				is_first_case = false;
> > +			}
> > +			run_test(i + 1, &test_cases[i]);
> > +
> > +			/* clean up the EAL */
> > +			rte_eal_cleanup();
> > +
> > +			fclose(fd);
> > +
> > +			printf("\nCase %u completed.\n\n", i + 1);
> > +
> > +			exit(EXIT_SUCCESS);
> > +		} else {
> > +			wpid = waitpid(cpid, &wstatus, 0);
> > +			if (wpid == -1) {
> > +				printf("waitpid error.\n");
> > +				exit(EXIT_FAILURE);
> > +			}
> > +
> > +			if (WIFEXITED(wstatus))
> > +				printf("Case process exited. status %d\n\n",
> > +					WEXITSTATUS(wstatus));
> > +			else if (WIFSIGNALED(wstatus))
> > +				printf("Case process killed by signal %d\n\n",
> > +					WTERMSIG(wstatus));
> > +			else if (WIFSTOPPED(wstatus))
> > +				printf("Case process stopped by
> signal %d\n\n",
> > +					WSTOPSIG(wstatus));
> > +			else if (WIFCONTINUED(wstatus))
> > +				printf("Case process continued.\n\n");
> > +			else
> > +				printf("Case process unknown
> terminated.\n\n");
> > +		}
> > +	}
> > +
> > +	printf("Bye...\n");
> > +	return 0;
> > +}
> 
> <snip>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-13  4:31 ` [PATCH v6] " Cheng Jiang
  2023-06-13 12:55   ` huangdengdui
@ 2023-06-15  5:21   ` Anoob Joseph
  2023-06-15  8:01     ` Jiang, Cheng1
  1 sibling, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-15  5:21 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia,
	Amit Prakash Shukla
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he

Hi,

Thanks for working on the comments. A few more top-level comments inline.

Thanks,
Anoob

> -----Original Message-----
> From: Cheng Jiang <cheng1.jiang@intel.com>
> Sent: Tuesday, June 13, 2023 10:02 AM
> To: thomas@monjalon.net; bruce.richardson@intel.com;
> mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>
> Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> Cheng Jiang <cheng1.jiang@intel.com>
> Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> External Email
> 
> ----------------------------------------------------------------------
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v6:
>   improved code based on Anoob's comments;
>   fixed some code structure issues;
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 477 ++++++++++++++++++++++++++++
> app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 569
> ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1192 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build
> 
> diff --git a/app/meson.build b/app/meson.build index
> 74d2420f67..4fc1a83eba 100644
> --- a/app/meson.build
> +++ b/app/meson.build
> @@ -19,6 +19,7 @@ apps = [
>          'test-cmdline',
>          'test-compress-perf',
>          'test-crypto-perf',
> +        'test-dma-perf',
>          'test-eventdev',
>          'test-fib',
>          'test-flow-perf',
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> perf/benchmark.c new file mode 100644 index 0000000000..bc1ca82297
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.c
> @@ -0,0 +1,477 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Intel Corporation
> + */
> +
> +#include <inttypes.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +
> +#include <rte_time.h>
> +#include <rte_mbuf.h>
> +#include <rte_dmadev.h>
> +#include <rte_malloc.h>
> +#include <rte_lcore.h>
> +
> +#include "main.h"
> +
> +#define MAX_DMA_CPL_NB 255
> +
> +#define TEST_WAIT_U_SECOND 10000
> +
> +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64
> ",%.3lf,%.3lf\n"
> +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64
> ",%.3lf,%.3lf\n"
> +
> +struct worker_info {
> +	bool ready_flag;
> +	bool start_flag;
> +	bool stop_flag;
> +	uint32_t total_cpl;
> +	uint32_t test_cpl;
> +};
> +
> +struct lcore_params {
> +	uint8_t scenario_id;
> +	unsigned int lcore_id;
> +	char *dma_name;
> +	uint16_t worker_id;
> +	uint16_t dev_id;
> +	uint32_t nr_buf;
> +	uint16_t kick_batch;
> +	uint32_t buf_size;
> +	uint16_t test_secs;
> +	struct rte_mbuf **srcs;
> +	struct rte_mbuf **dsts;
> +	struct worker_info worker_info;
> +};
> +
> +static struct rte_mempool *src_pool;
> +static struct rte_mempool *dst_pool;
> +
> +static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
> +
> +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> +
> +static inline int
> +__rte_format_printf(3, 4)
> +print_err(const char *func, int lineno, const char *format, ...) {
> +	va_list ap;
> +	int ret;
> +
> +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> +	va_start(ap, format);
> +	ret += vfprintf(stderr, format, ap);
> +	va_end(ap);
> +
> +	return ret;
> +}
> +
> +static inline void
> +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers,
> uint16_t test_secs,
> +				uint32_t total_cnt, float *memory, uint32_t
> *ave_cycle,
> +				float *bandwidth, float *mops)
> +{
> +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 *
> 1024);
> +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> (float)*ave_cycle)) / 1000000000;
> +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> +
> +static void
> +output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
> uint64_t ave_cycle,
> +			uint32_t buf_size, uint32_t nr_buf, float memory,
> +			float bandwidth, float mops, bool is_dma) {
> +	if (is_dma)
> +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> +	else
> +		printf("lcore %u\n", lcore_id);
> +
> +	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u,
> memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> +			ave_cycle, buf_size, nr_buf, memory,
> rte_get_timer_hz());
> +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth,
> +mops);
> +
> +	if (is_dma)
> +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> CSV_LINE_DMA_FMT,
> +			scenario_id, lcore_id, dma_name, buf_size,
> +			nr_buf, memory, ave_cycle, bandwidth, mops);
> +	else
> +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> CSV_LINE_CPU_FMT,
> +			scenario_id, lcore_id, buf_size,
> +			nr_buf, memory, ave_cycle, bandwidth, mops); }
> +
> +static inline void
> +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> +		__maybe_unused uint32_t buf_size,
> +		__maybe_unused uint32_t nr_buf)
> +{
> +#ifdef RTE_ARCH_X86_64
> +	char *data;
> +	struct rte_mbuf **srcs = array;
> +	uint32_t i, offset;
> +
> +	for (i = 0; i < nr_buf; i++) {
> +		data = rte_pktmbuf_mtod(srcs[i], char *);
> +		for (offset = 0; offset < buf_size; offset += 64)
> +			__builtin_ia32_clflush(data + offset);
> +	}
> +#endif
> +}
> +
> +/* Configuration of device. */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> +	uint16_t vchan = 0;
> +	struct rte_dma_info info;
> +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> +	struct rte_dma_vchan_conf qconf = {
> +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> +		.nb_desc = ring_size
> +	};
> +
> +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> +
> +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> +
> +	rte_dma_info_get(dev_id, &info);
> +	if (info.nb_vchans != 1)
> +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> reported on device id. %u\n",
> +				dev_id);
> +
> +	if (rte_dma_start(dev_id) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> +
> +static int
> +config_dmadevs(struct test_configure *cfg) {
> +	uint32_t ring_size = cfg->ring_size.cur;
> +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	uint32_t nb_workers = ldm->cnt;
> +	uint32_t i;
> +	int dev_id;
> +	uint16_t nb_dmadevs = 0;
> +	char *dma_name;
> +
> +	for (i = 0; i < ldm->cnt; i++) {
> +		dma_name = ldm->dma_names[i];
> +		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> +		if (dev_id == -1) {
> +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> dma_name);
> +			goto end;
> +		}
> +
> +		ldm->dma_ids[i] = dev_id;
> +		configure_dmadev_queue(dev_id, ring_size);
> +		++nb_dmadevs;
> +	}
> +
> +end:
> +	if (nb_dmadevs < nb_workers) {
> +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> nb_dmadevs, nb_workers);
> +		return -1;
> +	}
> +
> +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> +
> +	return 0;
> +}
> +
> +#define POLL_MAX 1000
> +
> +

[Anoob] Extra blank line. Consider removing it.

> +static inline void
> +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> +			volatile struct worker_info *worker_info) {
> +	int ret;
> +	uint16_t nr_cpl;
> +
> +	ret = rte_dma_submit(dev_id, 0);
> +	if (ret < 0) {
> +		rte_dma_stop(dev_id);
> +		rte_dma_close(dev_id);
> +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> +	}
> +
> +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> NULL);
> +	*async_cnt -= nr_cpl;
> +	worker_info->total_cpl += nr_cpl;
> +}
> +
> +static inline int
> +do_dma_mem_copy(void *p)
> +{
> +	const uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = worker_params[*para_idx];
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	const uint16_t dev_id = para->dev_id;
> +	const uint32_t nr_buf = para->nr_buf;
> +	const uint16_t kick_batch = para->kick_batch;
> +	const uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	uint16_t nr_cpl;
> +	uint64_t async_cnt = 0;
> +	uint32_t i;
> +	uint32_t poll_cnt = 0;
> +	int ret;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +dma_copy:
> +			ret = rte_dma_copy(dev_id, 0,
> rte_pktmbuf_iova(srcs[i]),
> +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> +			if (unlikely(ret < 0)) {
> +				if (ret == -ENOSPC) {
> +					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> +					goto dma_copy;
> +				} else {
> +					/* Error exit */
> +					rte_dma_stop(dev_id);
> +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");
> +				}
> +			}
> +			async_cnt++;
> +
> +			if ((async_cnt % kick_batch) == 0)
> +				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +
> +static inline int
> +do_cpu_mem_copy(void *p)
> +{
> +	const uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = worker_params[*para_idx];
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	const uint32_t nr_buf = para->nr_buf;
> +	const uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	uint32_t i;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +			/* copy buffer form src to dst */
> +			rte_memcpy((void
> *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> +				(void
> *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> +				(size_t)buf_size);
> +			worker_info->total_cpl++;
> +		}
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> +			struct rte_mbuf ***dsts)
> +{
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	unsigned int nr_sockets;
> +	uint32_t nr_buf = cfg->nr_buf;
> +
> +	nr_sockets = rte_socket_count();
> +	if (cfg->src_numa_node >= nr_sockets ||
> +		cfg->dst_numa_node >= nr_sockets) {
> +		printf("Error: Source or destination numa exceeds the acture
> numa nodes.\n");
> +		return -1;
> +	}
> +
> +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +			nr_buf, /* n == num elements */
> +			64,  /* cache size */
> +			0,   /* priv size */
> +			buf_size + RTE_PKTMBUF_HEADROOM,
> +			cfg->src_numa_node);
> +	if (src_pool == NULL) {
> +		PRINT_ERR("Error with source mempool creation.\n");
> +		return -1;
> +	}
> +
> +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +			nr_buf, /* n == num elements */
> +			64,  /* cache size */

[Anoob] We do not alloc or free pointers in the datapath, right? So why bother with cache?

> +			0,   /* priv size */
> +			buf_size + RTE_PKTMBUF_HEADROOM,
> +			cfg->dst_numa_node);
> +	if (dst_pool == NULL) {
> +		PRINT_ERR("Error with destination mempool creation.\n");
> +		return -1;
> +	}
> +
> +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	if (*srcs == NULL) {
> +		printf("Error: srcs malloc failed.\n");
> +		return -1;
> +	}

[Anoob] Are we freeing this memory (the arrays allocated with rte_malloc)?

> +
> +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	if (*dsts == NULL) {
> +		printf("Error: dsts malloc failed.\n");
> +		return -1;
> +	}
> +
> +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
> +		printf("get src mbufs failed.\n");
> +		return -1;
> +	}
> +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
> +		printf("get dst mbufs failed.\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +void
> +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> +	uint16_t i;
> +	uint32_t offset;
> +	unsigned int lcore_id = 0;
> +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	uint16_t kick_batch = cfg->kick_batch.cur;
> +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> +	uint16_t nb_workers = ldm->cnt;
> +	uint16_t test_secs = cfg->test_secs;
> +	float memory;
> +	uint32_t avg_cycles = 0;
> +	float mops;
> +	float bandwidth;
> +
> +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +		goto out;
> +
> +	if (is_dma)
> +		if (config_dmadevs(cfg) < 0)
> +			goto out;
> +
> +	if (cfg->cache_flush) {
> +		cache_flush_buf(srcs, buf_size, nr_buf);
> +		cache_flush_buf(dsts, buf_size, nr_buf);
> +		rte_mb();
> +	}
> +
> +	printf("Start testing....\n");
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		lcore_id = ldm->lcores[i];
> +		offset = nr_buf / nb_workers * i;
> +
> +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> lcore_params), 0);
> +		if (!worker_params[i]) {
> +			printf("lcore parameters malloc failure for lcore
> %d\n", lcore_id);
> +			break;
> +		}

[Anoob] Are we freeing the above memory?

> +		if (is_dma) {
> +			worker_params[i]->dma_name = ldm-
> >dma_names[i];
> +			worker_params[i]->dev_id = ldm->dma_ids[i];
> +			worker_params[i]->kick_batch = kick_batch;
> +		}
> +		worker_params[i]->worker_id = i;
> +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> nb_workers);
> +		worker_params[i]->buf_size = buf_size;
> +		worker_params[i]->test_secs = test_secs;
> +		worker_params[i]->srcs = srcs + offset;
> +		worker_params[i]->dsts = dsts + offset;
> +		worker_params[i]->scenario_id = cfg->scenario_id;
> +		worker_params[i]->lcore_id = lcore_id;
> +
> +		if (is_dma)
> +			rte_eal_remote_launch(do_dma_mem_copy, (void
> *)(&i), lcore_id);
> +		else
> +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> *)(&i), lcore_id);
> +	}
> +
> +	while (1) {
> +		bool ready = true;
> +		for (i = 0; i < nb_workers; i++) {
> +			if (worker_params[i]->worker_info.ready_flag ==
> false) {
> +				ready = 0;
> +				break;
> +			}
> +		}
> +		if (ready)
> +			break;
> +	}
> +
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.start_flag = true;
> +
> +	usleep(TEST_WAIT_U_SECOND);
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.test_cpl =
> +worker_params[i]->worker_info.total_cpl;
> +
> +	usleep(test_secs * 1000 * 1000);
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.test_cpl =
> worker_params[i]->worker_info.total_cpl -
> +						worker_params[i]-
> >worker_info.test_cpl;
> +
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.stop_flag = true;
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> +			worker_params[i]->worker_info.test_cpl,
> +			&memory, &avg_cycles, &bandwidth, &mops);
> +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> +					worker_params[i]->dma_name,
> avg_cycles, buf_size,
> +					nr_buf / nb_workers, memory,
> bandwidth, mops, is_dma);
> +	}
> +
> +out:
> +	/* free env */
> +	if (srcs)
> +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> +	if (dsts)
> +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> +
> +	if (src_pool)
> +		rte_mempool_free(src_pool);
> +	if (dst_pool)
> +		rte_mempool_free(dst_pool);
> +
> +	if (is_dma) {
> +		for (i = 0; i < nb_workers; i++) {
> +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> +			rte_dma_stop(ldm->dma_ids[i]);
> +		}
> +	}
> +}
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini new
> file mode 100644 index 0000000000..2fd9c3c387
> --- /dev/null
> +++ b/app/test-dma-perf/config.ini
> @@ -0,0 +1,59 @@
> +
> +; This is an example configuration file for dma-perf, which details the
> +meanings of each parameter ; and instructions on how to use dma-perf.
> +
> +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
> +
> +; Parameters:
> +; "mem_size" denotes the size of the memory footprint.
> +; "buf_size" denotes the memory size of a single operation.
> +; "dma_ring_size" denotes the dma ring buffer size. It should be greater
> than 64 normally.
> +; "kick_batch" denotes the dma operation batch size, and should be greater
> than 1 normally.
> +
> +; The format for variables is variable=first,last,increment,ADD|MUL.
> +
> +; src_numa_node is used to control the numa node where the source
> memory is allocated.
> +; dst_numa_node is used to control the numa node where the destination
> memory is allocated.
> +
> +; cache_flush is used to determine whether or not the cache should be
> +flushed, with 1 indicating to ; flush and 0 indicating to not flush.
> +
> +; test_seconds controls the test time of the whole case.
> +
> +; To use DMA for a test, please specify the "lcore_dma" parameter.
> +; If you have already set the "-l" and "-a" parameters using EAL, ;
> +make sure that the value of "lcore_dma" falls within their range of the
> values.
> +
> +; To use CPU for a test, please specify the "lcore" parameter.
> +; If you have already set the "-l" and "-a" parameters using EAL, ;
> +make sure that the value of "lcore" falls within their range of values.
> +
> +; To specify a configuration file, use the "--config" flag followed by the path
> to the file.
> +
> +; To specify a result file, use the "--result" flag followed by the path to the
> file.
> +; If you do not specify a result file, one will be generated with the
> +same name as the configuration ; file, with the addition of "_result.csv" at
> the end.
> +
> +[case1]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +test_seconds=2
> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3

[Anoob] Wouldn't it be better to let the user specify the DMA dev ID rather than the PCI DBDF?

In the long run, I would expect the config file to provide {core, dma_dev_id, queue_id}.

Another thought: why expose this at all? If we can restrict this perf application so that each thread uses only one vchan, then the application can easily create this mapping at run time. The only exception would be if you want one thread to use 2 different vchans, which may not be desirable since this is a standalone perf app.

> +eal_args=--in-memory --file-prefix=test
> +
> +[case2]
> +type=CPU_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +src_numa_node=0
> +dst_numa_node=1
> +cache_flush=0
> +test_seconds=2
> +lcore = 3, 4
> +eal_args=--in-memory --no-pci
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c new file
> mode 100644 index 0000000000..d65655b87b
> --- /dev/null
> +++ b/app/test-dma-perf/main.c
> @@ -0,0 +1,569 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <unistd.h>
> +#include <sys/wait.h>
> +#include <inttypes.h>
> +#include <libgen.h>
> +
> +#include <rte_eal.h>
> +#include <rte_cfgfile.h>
> +#include <rte_string_fns.h>
> +#include <rte_lcore.h>
> +
> +#include "main.h"
> +
> +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> +
> +#define MAX_EAL_PARAM_NB 100
> +#define MAX_EAL_PARAM_LEN 1024
> +
> +#define DMA_MEM_COPY "DMA_MEM_COPY"
> +#define CPU_MEM_COPY "CPU_MEM_COPY"
> +
> +#define CMDLINE_CONFIG_ARG "--config"
> +#define CMDLINE_RESULT_ARG "--result"
> +
> +#define MAX_PARAMS_PER_ENTRY 4
> +
> +#define MAX_LONG_OPT_SZ 64
> +
> +enum {
> +	TEST_TYPE_NONE = 0,
> +	TEST_TYPE_DMA_MEM_COPY,
> +	TEST_TYPE_CPU_MEM_COPY
> +};
> +
> +#define MAX_TEST_CASES 16
> +static struct test_configure test_cases[MAX_TEST_CASES];
> +
> +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> +
> +static FILE *fd;
> +
> +static void
> +output_csv(bool need_blankline)
> +{
> +	uint32_t i;
> +
> +	if (need_blankline) {
> +		fprintf(fd, ",,,,,,,,\n");
> +		fprintf(fd, ",,,,,,,,\n");
> +	}
> +
> +	for (i = 0; i < RTE_DIM(output_str); i++) {
> +		if (output_str[i][0]) {
> +			fprintf(fd, "%s", output_str[i]);
> +			output_str[i][0] = '\0';
> +		}
> +	}
> +
> +	fflush(fd);
> +}
> +
> +static void
> +output_env_info(void)
> +{
> +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> environment:\n");
> +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%"
> +			PRIu64 "\n", rte_get_timer_hz());
> +
> +	output_csv(true);
> +}
> +
> +static void
> +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> +			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
> +
> +	output_csv(true);
> +}
> +
> +static void
> +run_test_case(struct test_configure *case_cfg) {
> +	switch (case_cfg->test_type) {
> +	case TEST_TYPE_DMA_MEM_COPY:
> +		mem_copy_benchmark(case_cfg, true);
> +		break;
> +	case TEST_TYPE_CPU_MEM_COPY:
> +		mem_copy_benchmark(case_cfg, false);
> +		break;
> +	default:
> +		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> +		break;
> +	}
> +}
> +
> +static void
> +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> +	uint32_t i;
> +	uint32_t nb_lcores = rte_lcore_count();
> +	struct test_configure_entry *mem_size = &case_cfg->mem_size;
> +	struct test_configure_entry *buf_size = &case_cfg->buf_size;
> +	struct test_configure_entry *ring_size = &case_cfg->ring_size;
> +	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
> +	struct test_configure_entry dummy = { 0 };
> +	struct test_configure_entry *var_entry = &dummy;
> +
> +	for (i = 0; i < RTE_DIM(output_str); i++)
> +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> +
> +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> +		printf("Case %u: Not enough lcores.\n", case_id);
> +		return;
> +	}
> +
> +	printf("Number of used lcores: %u.\n", nb_lcores);
> +
> +	if (mem_size->incr != 0)
> +		var_entry = mem_size;
> +
> +	if (buf_size->incr != 0)
> +		var_entry = buf_size;
> +
> +	if (ring_size->incr != 0)
> +		var_entry = ring_size;
> +
> +	if (kick_batch->incr != 0)
> +		var_entry = kick_batch;
> +
> +	case_cfg->scenario_id = 0;
> +
> +	output_header(case_id, case_cfg);
> +
> +	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry-
> >last;) {
> +		case_cfg->scenario_id++;
> +		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
> +
> +		run_test_case(case_cfg);
> +		output_csv(false);
> +
> +		if (var_entry->op == OP_ADD)
> +			var_entry->cur += var_entry->incr;
> +		else if (var_entry->op == OP_MUL)
> +			var_entry->cur *= var_entry->incr;
> +		else
> +			break;
> +	}
> +}
> +
> +static int
> +parse_lcore(struct test_configure *test_case, const char *value) {
> +	size_t len = strlen(value);
> +	char *input = (char *) malloc((len + 1) * sizeof(char));
> +	strcpy(input, value);
> +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> >lcore_dma_map);
> +
> +	if (test_case == NULL || value == NULL)
> +		return -1;
> +
> +	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
> +
> +	char *token = strtok(input, ", ");
> +	while (token != NULL) {
> +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> +			free(input);
> +			return -1;
> +		}
> +
> +		uint16_t lcore_id = atoi(token);
> +		lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;
> +
> +		token = strtok(NULL, ", ");
> +	}
> +
> +	free(input);
> +	return 0;
> +}
> +
> +static int
> +parse_lcore_dma(struct test_configure *test_case, const char *value) {
> +	struct lcore_dma_map_t *lcore_dma_map;
> +	char *input = strndup(value, strlen(value) + 1);
> +	char *addrs = input;
> +	char *ptrs[2];
> +	char *start, *end, *substr;
> +	uint16_t lcore_id;
> +	int ret = 0;
> +
> +	while (*addrs == '\0')
> +		addrs++;
> +	if (*addrs == '\0') {
> +		fprintf(stderr, "No input DMA addresses\n");
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	substr = strtok(addrs, ",");
> +	if (substr == NULL) {
> +		fprintf(stderr, "No input DMA address\n");
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> lcore_dma_map_t));
> +
> +	do {
> +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> +
> +		start = strstr(ptrs[0], "lcore");
> +		if (start == NULL) {
> +			fprintf(stderr, "Illegal lcore\n");
> +			ret = -1;
> +			break;
> +		}
> +
> +		start += 5;
> +		lcore_id = strtol(start, &end, 0);
> +		if (end == start) {
> +			fprintf(stderr, "No input lcore ID or ID %d is
> wrong\n", lcore_id);
> +			ret = -1;
> +			break;
> +		}
> +
> +		lcore_dma_map = &test_case->lcore_dma_map;
> +		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
> +		strcpy(lcore_dma_map->dma_names[lcore_dma_map-
> >cnt], ptrs[1]);
> +		lcore_dma_map->cnt++;
> +		substr = strtok(NULL, ",");
> +	} while (substr != NULL);
> +
> +out:
> +	free(input);
> +	return ret;
> +}
> +
> +static int
> +parse_entry(const char *value, struct test_configure_entry *entry) {
> +	char input[255] = {0};
> +	char *args[MAX_PARAMS_PER_ENTRY];
> +	int args_nr = -1;
> +
> +	if (value == NULL || entry == NULL)
> +		goto out;
> +
> +	strncpy(input, value, 254);
> +	if (*input == '\0')
> +		goto out;
> +
> +	args_nr = rte_strsplit(input, strlen(input), args,
> MAX_PARAMS_PER_ENTRY, ',');
> +	if (args_nr != 1 && args_nr != 4)
> +		goto out;
> +
> +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> +
> +	if (args_nr == 4) {
> +		entry->last = (uint32_t)atoi(args[1]);
> +		entry->incr = (uint32_t)atoi(args[2]);
> +		if (!strcmp(args[3], "MUL"))
> +			entry->op = OP_MUL;
> +		else if (!strcmp(args[3], "ADD"))
> +			entry->op = OP_ADD;
> +		else {
> +			printf("Invalid op %s.\n", args[3]);
> +			args_nr = -1;
> +		}
> +	} else {
> +		entry->op = OP_NONE;
> +		entry->last = 0;
> +		entry->incr = 0;
> +	}
> +out:
> +	return args_nr;
> +}
> +
> +static uint16_t
> +load_configs(const char *path)
> +{
> +	struct rte_cfgfile *cfgfile;
> +	int nb_sections, i;
> +	struct test_configure *test_case;
> +	char section_name[CFG_NAME_LEN];
> +	const char *case_type;
> +	const char *lcore_dma;
> +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> *kick_batch_str;
> +	int args_nr, nb_vp;
> +	bool is_dma;
> +
> +	printf("config file parsing...\n");
> +	cfgfile = rte_cfgfile_load(path, 0);
> +	if (!cfgfile) {
> +		printf("Open configure file error.\n");
> +		exit(1);
> +	}
> +
> +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> +	if (nb_sections > MAX_TEST_CASES) {
> +		printf("Error: The maximum number of cases is %d.\n",
> MAX_TEST_CASES);
> +		exit(1);
> +	}
> +
> +	for (i = 0; i < nb_sections; i++) {
> +		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
> +		test_case = &test_cases[i];
> +		case_type = rte_cfgfile_get_entry(cfgfile, section_name,
> "type");
> +		if (!case_type) {
> +			printf("Error: No case type in case %d, the test will be
> finished here.\n",
> +				i + 1);
> +			test_case->is_valid = false;
> +			continue;
> +		}
> +
> +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> +			test_case->test_type =
> TEST_TYPE_DMA_MEM_COPY;
> +			test_case->test_type_str = DMA_MEM_COPY;
> +			is_dma = true;
> +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> +			test_case->test_type =
> TEST_TYPE_CPU_MEM_COPY;
> +			test_case->test_type_str = CPU_MEM_COPY;
> +			is_dma = false;
> +		} else {
> +			printf("Error: Cannot find case type %s in case%d.\n",
> case_type, i + 1);
> +			test_case->is_valid = false;
> +			continue;
> +		}
> +
> +		nb_vp = 0;
> +
> +		test_case->src_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +
> 	section_name, "src_numa_node"));
> +		test_case->dst_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +
> 	section_name, "dst_numa_node"));
> +
> +		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> "mem_size");
> +		args_nr = parse_entry(mem_size_str, &test_case-
> >mem_size);
> +		if (args_nr < 0) {
> +			printf("parse error in case %d.\n", i + 1);
> +			test_case->is_valid = false;
> +			continue;
> +		} else if (args_nr > 1)
> +			nb_vp++;
> +
> +		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> "buf_size");
> +		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
> +		if (args_nr < 0) {
> +			printf("parse error in case %d.\n", i + 1);
> +			test_case->is_valid = false;
> +			continue;
> +		} else if (args_nr > 1)
> +			nb_vp++;
> +
> +		if (is_dma) {
> +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> +
> 	"dma_ring_size");
> +			args_nr = parse_entry(ring_size_str, &test_case-
> >ring_size);
> +			if (args_nr < 0) {
> +				printf("parse error in case %d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			} else if (args_nr > 1)
> +				nb_vp++;
> +
> +			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
> section_name, "kick_batch");
> +			args_nr = parse_entry(kick_batch_str, &test_case-
> >kick_batch);
> +			if (args_nr < 0) {
> +				printf("parse error in case %d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			} else if (args_nr > 1)
> +				nb_vp++;
> +
> +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> section_name, "lcore_dma");
> +			int lcore_ret = parse_lcore_dma(test_case,
> lcore_dma);
> +			if (lcore_ret < 0) {
> +				printf("parse lcore dma error in case %d.\n", i
> + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +		} else {
> +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> section_name, "lcore");
> +			int lcore_ret = parse_lcore(test_case, lcore_dma);
> +			if (lcore_ret < 0) {
> +				printf("parse lcore error in case %d.\n", i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			}
> +		}
> +
> +		if (nb_vp > 1) {
> +			printf("Error, each section can only have a single
> variable parameter.\n");
> +			test_case->is_valid = false;
> +			continue;
> +		}
> +
> +		test_case->cache_flush =
> +			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name,
> "cache_flush"));
> +		test_case->test_secs =
> (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> +					section_name, "test_seconds"));
> +
> +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> section_name, "eal_args");
> +		test_case->is_valid = true;
> +	}
> +
> +	rte_cfgfile_close(cfgfile);
> +	printf("config file parsing complete.\n\n");
> +	return i;
> +}
> +
> +/* Parse the argument given in the command line of the application */
> +static int append_eal_args(int argc, char **argv, const char *eal_args,
> +char **new_argv) {
> +	int i;
> +	char *tokens[MAX_EAL_PARAM_NB];
> +	char args[MAX_EAL_PARAM_LEN] = {0};
> +	int token_nb, new_argc = 0;
> +
> +	for (i = 0; i < argc; i++) {
> +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> +				(strcmp(argv[i], CMDLINE_RESULT_ARG) ==
> 0)) {
> +			i++;
> +			continue;
> +		}
> +		strlcpy(new_argv[new_argc], argv[i],
> sizeof(new_argv[new_argc]));
> +		new_argc++;
> +	}
> +
> +	if (eal_args) {
> +		strlcpy(args, eal_args, sizeof(args));
> +		token_nb = rte_strsplit(args, strlen(args),
> +					tokens, MAX_EAL_PARAM_NB, ' ');
> +		for (i = 0; i < token_nb; i++)
> +			strcpy(new_argv[new_argc++], tokens[i]);
> +	}
> +
> +	return new_argc;
> +}
> +
> +int
> +main(int argc, char *argv[])
> +{
> +	int ret;
> +	uint16_t case_nb;
> +	uint32_t i, nb_lcores;
> +	pid_t cpid, wpid;
> +	int wstatus;
> +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> +	char *pargs[MAX_EAL_PARAM_NB];
> +	char *cfg_path_ptr = NULL;
> +	char *rst_path_ptr = NULL;
> +	char rst_path[PATH_MAX];
> +	int new_argc;
> +	bool is_first_case = true;
> +
> +	memset(args, 0, sizeof(args));
> +
> +	for (i = 0; i < RTE_DIM(pargs); i++)
> +		pargs[i] = args[i];
> +
> +	for (i = 0; i < (uint32_t)argc; i++) {
> +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> MAX_LONG_OPT_SZ) == 0)
> +			cfg_path_ptr = argv[i + 1];
> +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> MAX_LONG_OPT_SZ) == 0)
> +			rst_path_ptr = argv[i + 1];
> +	}
> +	if (cfg_path_ptr == NULL) {
> +		printf("Config file not assigned.\n");
> +		return -1;
> +	}
> +	if (rst_path_ptr == NULL) {
> +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> +		strcat(strtok(basename(rst_path), "."), "_result.csv");
> +		rst_path_ptr = rst_path;
> +	}
> +
> +	case_nb = load_configs(cfg_path_ptr);
> +	fd = fopen(rst_path_ptr, "w");
> +	if (fd == NULL) {
> +		printf("Open output CSV file error.\n");
> +		return -1;
> +	}
> +	fclose(fd);
> +
> +	for (i = 0; i < case_nb; i++) {
> +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> +			printf("No test type in test case %d.\n\n", i + 1);
> +			continue;
> +		}
> +		if (!test_cases[i].is_valid) {
> +			printf("Invalid test case %d.\n\n", i + 1);
> +			continue;
> +		}
> +
> +		cpid = fork();

[Anoob] Do we really need fork()? Can't we use code like,

		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			ret |= rte_eal_wait_lcore(lcore_id);
		}

to wait for all threads to exit?

> +		if (cpid < 0) {
> +			printf("Fork case %d failed.\n", i + 1);
> +			exit(EXIT_FAILURE);
> +		} else if (cpid == 0) {
> +			printf("\nRunning case %u\n\n", i + 1);
> +
> +			new_argc = append_eal_args(argc, argv,
> test_cases[i].eal_args, pargs);
> +			ret = rte_eal_init(new_argc, pargs);
> +			if (ret < 0)
> +				rte_exit(EXIT_FAILURE, "Invalid EAL
> arguments\n");
> +
> +			/* Check lcores. */
> +			nb_lcores = rte_lcore_count();
> +			if (nb_lcores < 2)
> +				rte_exit(EXIT_FAILURE,
> +					"There should be at least 2 worker
> lcores.\n");
> +
> +			fd = fopen(rst_path_ptr, "a");
> +			if (!fd) {
> +				printf("Open output CSV file error.\n");
> +				return 0;
> +			}
> +
> +			if (is_first_case) {
> +				output_env_info();
> +				is_first_case = false;
> +			}
> +			run_test(i + 1, &test_cases[i]);
> +
> +			/* clean up the EAL */
> +			rte_eal_cleanup();
> +
> +			fclose(fd);
> +
> +			printf("\nCase %u completed.\n\n", i + 1);
> +
> +			exit(EXIT_SUCCESS);
> +		} else {
> +			wpid = waitpid(cpid, &wstatus, 0);
> +			if (wpid == -1) {
> +				printf("waitpid error.\n");
> +				exit(EXIT_FAILURE);
> +			}
> +
> +			if (WIFEXITED(wstatus))
> +				printf("Case process exited. status %d\n\n",
> +					WEXITSTATUS(wstatus));
> +			else if (WIFSIGNALED(wstatus))
> +				printf("Case process killed by signal %d\n\n",
> +					WTERMSIG(wstatus));
> +			else if (WIFSTOPPED(wstatus))
> +				printf("Case process stopped by signal
> %d\n\n",
> +					WSTOPSIG(wstatus));
> +			else if (WIFCONTINUED(wstatus))
> +				printf("Case process continued.\n\n");
> +			else
> +				printf("Case process unknown
> terminated.\n\n");
> +		}
> +	}
> +
> +	printf("Bye...\n");
> +	return 0;
> +}
> +
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new file
> mode 100644 index 0000000000..215ac42673
> --- /dev/null
> +++ b/app/test-dma-perf/main.h
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Intel Corporation
> + */
> +
> +#ifndef _MAIN_H_
> +#define _MAIN_H_
> +
> +
> +#include <rte_common.h>
> +#include <rte_cycles.h>
> +#include <rte_dev.h>
> +#include <rte_dmadev.h>
> +
> +#ifndef __maybe_unused
> +#define __maybe_unused	__rte_unused
> +#endif
> +
> +#define MAX_WORKER_NB 128
> +#define MAX_OUTPUT_STR_LEN 512
> +
> +#define MAX_DMA_NB 128
> +#define MAX_LCORE_NB 256
> +
> +extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> +
> +typedef enum {
> +	OP_NONE = 0,
> +	OP_ADD,
> +	OP_MUL
> +} alg_op_type;
> +
> +struct test_configure_entry {
> +	uint32_t first;
> +	uint32_t last;
> +	uint32_t incr;
> +	alg_op_type op;
> +	uint32_t cur;
> +};
> +
> +struct lcore_dma_map_t {
> +	uint32_t lcores[MAX_WORKER_NB];
> +	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> +	int16_t dma_ids[MAX_WORKER_NB];
> +	uint16_t cnt;
> +};
> +
> +struct test_configure {
> +	bool is_valid;
> +	uint8_t test_type;
> +	const char *test_type_str;
> +	uint16_t src_numa_node;
> +	uint16_t dst_numa_node;
> +	uint16_t opcode;
> +	bool is_dma;
> +	struct lcore_dma_map_t lcore_dma_map;
> +	struct test_configure_entry mem_size;
> +	struct test_configure_entry buf_size;
> +	struct test_configure_entry ring_size;
> +	struct test_configure_entry kick_batch;
> +	uint32_t cache_flush;
> +	uint32_t nr_buf;
> +	uint16_t test_secs;
> +	const char *eal_args;
> +	uint8_t scenario_id;
> +};
> +
> +void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> +
> +#endif /* _MAIN_H_ */
> diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> --- /dev/null
> +++ b/app/test-dma-perf/meson.build
> @@ -0,0 +1,17 @@
> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023 Intel
> +Corporation
> +
> +# meson file, for building this app as part of a main DPDK build.
> +
> +if is_windows
> +    build = false
> +    reason = 'not supported on Windows'
> +    subdir_done()
> +endif
> +
> +deps += ['dmadev', 'mbuf', 'cfgfile']
> +
> +sources = files(
> +        'main.c',
> +        'benchmark.c',
> +)
> --
> 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-15  5:21   ` [EXT] " Anoob Joseph
@ 2023-06-15  8:01     ` Jiang, Cheng1
  2023-06-15  8:44       ` Anoob Joseph
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-15  8:01 UTC (permalink / raw)
  To: Anoob Joseph, thomas, Richardson, Bruce, mb, Xia, Chenbo,
	Amit Prakash Shukla
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang

Hi,

Thanks for your comments, the replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Thursday, June 15, 2023 1:22 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>; mb@smartsharesystems.com;
> Xia, Chenbo <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Thanks for working on the comments. A few more top-level comments inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Cheng Jiang <cheng1.jiang@intel.com>
> > Sent: Tuesday, June 13, 2023 10:02 AM
> > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>
> > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> > Cheng Jiang <cheng1.jiang@intel.com>
> > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > There are many high-performance DMA devices supported in DPDK now, and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre- set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v6:
> >   improved code based on Anoob's comments;
> >   fixed some code structure issues;
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 477 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 569
> > ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1192 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> > 100644 app/test-dma-perf/meson.build
> >
> > diff --git a/app/meson.build b/app/meson.build index
> > 74d2420f67..4fc1a83eba 100644
> > --- a/app/meson.build
> > +++ b/app/meson.build
> > @@ -19,6 +19,7 @@ apps = [
> >          'test-cmdline',
> >          'test-compress-perf',
> >          'test-crypto-perf',
> > +        'test-dma-perf',
> >          'test-eventdev',
> >          'test-fib',
> >          'test-flow-perf',
> > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > perf/benchmark.c new file mode 100644 index 0000000000..bc1ca82297
> > --- /dev/null
> > +++ b/app/test-dma-perf/benchmark.c
> > @@ -0,0 +1,477 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2023 Intel Corporation  */
> > +
> > +#include <inttypes.h>
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <unistd.h>
> > +
> > +#include <rte_time.h>
> > +#include <rte_mbuf.h>
> > +#include <rte_dmadev.h>
> > +#include <rte_malloc.h>
> > +#include <rte_lcore.h>
> > +
> > +#include "main.h"
> > +
> > +#define MAX_DMA_CPL_NB 255
> > +
> > +#define TEST_WAIT_U_SECOND 10000
> > +
> > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64
> > ",%.3lf,%.3lf\n"
> > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64
> > ",%.3lf,%.3lf\n"
> > +
> > +struct worker_info {
> > +	bool ready_flag;
> > +	bool start_flag;
> > +	bool stop_flag;
> > +	uint32_t total_cpl;
> > +	uint32_t test_cpl;
> > +};
> > +
> > +struct lcore_params {
> > +	uint8_t scenario_id;
> > +	unsigned int lcore_id;
> > +	char *dma_name;
> > +	uint16_t worker_id;
> > +	uint16_t dev_id;
> > +	uint32_t nr_buf;
> > +	uint16_t kick_batch;
> > +	uint32_t buf_size;
> > +	uint16_t test_secs;
> > +	struct rte_mbuf **srcs;
> > +	struct rte_mbuf **dsts;
> > +	struct worker_info worker_info;
> > +};
> > +
> > +static struct rte_mempool *src_pool;
> > +static struct rte_mempool *dst_pool;
> > +
> > +static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
> > +
> > +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> > +
> > +static inline int
> > +__rte_format_printf(3, 4)
> > +print_err(const char *func, int lineno, const char *format, ...) {
> > +	va_list ap;
> > +	int ret;
> > +
> > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > +	va_start(ap, format);
> > +	ret += vfprintf(stderr, format, ap);
> > +	va_end(ap);
> > +
> > +	return ret;
> > +}
> > +
> > +static inline void
> > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers,
> > uint16_t test_secs,
> > +				uint32_t total_cnt, float *memory, uint32_t
> > *ave_cycle,
> > +				float *bandwidth, float *mops)
> > +{
> > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 *
> > 1024);
> > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > (float)*ave_cycle)) / 1000000000;
> > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > +
> > +static void
> > +output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
> > uint64_t ave_cycle,
> > +			uint32_t buf_size, uint32_t nr_buf, float memory,
> > +			float bandwidth, float mops, bool is_dma) {
> > +	if (is_dma)
> > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > +	else
> > +		printf("lcore %u\n", lcore_id);
> > +
> > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u,
> > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > +			ave_cycle, buf_size, nr_buf, memory,
> > rte_get_timer_hz());
> > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth,
> > +mops);
> > +
> > +	if (is_dma)
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > CSV_LINE_DMA_FMT,
> > +			scenario_id, lcore_id, dma_name, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, mops);
> > +	else
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > CSV_LINE_CPU_FMT,
> > +			scenario_id, lcore_id, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, mops); }
> > +
> > +static inline void
> > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > +		__maybe_unused uint32_t buf_size,
> > +		__maybe_unused uint32_t nr_buf)
> > +{
> > +#ifdef RTE_ARCH_X86_64
> > +	char *data;
> > +	struct rte_mbuf **srcs = array;
> > +	uint32_t i, offset;
> > +
> > +	for (i = 0; i < nr_buf; i++) {
> > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > +		for (offset = 0; offset < buf_size; offset += 64)
> > +			__builtin_ia32_clflush(data + offset);
> > +	}
> > +#endif
> > +}
> > +
> > +/* Configuration of device. */
> > +static void
> > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > +	uint16_t vchan = 0;
> > +	struct rte_dma_info info;
> > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > +	struct rte_dma_vchan_conf qconf = {
> > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > +		.nb_desc = ring_size
> > +	};
> > +
> > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> > +
> > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> > +
> > +	rte_dma_info_get(dev_id, &info);
> > +	if (info.nb_vchans != 1)
> > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > reported on device id. %u\n",
> > +				dev_id);
> > +
> > +	if (rte_dma_start(dev_id) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > +
> > +static int
> > +config_dmadevs(struct test_configure *cfg) {
> > +	uint32_t ring_size = cfg->ring_size.cur;
> > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	uint32_t nb_workers = ldm->cnt;
> > +	uint32_t i;
> > +	int dev_id;
> > +	uint16_t nb_dmadevs = 0;
> > +	char *dma_name;
> > +
> > +	for (i = 0; i < ldm->cnt; i++) {
> > +		dma_name = ldm->dma_names[i];
> > +		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> > +		if (dev_id == -1) {
> > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > dma_name);
> > +			goto end;
> > +		}
> > +
> > +		ldm->dma_ids[i] = dev_id;
> > +		configure_dmadev_queue(dev_id, ring_size);
> > +		++nb_dmadevs;
> > +	}
> > +
> > +end:
> > +	if (nb_dmadevs < nb_workers) {
> > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> > nb_dmadevs, nb_workers);
> > +		return -1;
> > +	}
> > +
> > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > +
> > +	return 0;
> > +}
> > +
> > +#define POLL_MAX 1000
> > +
> > +
> 
> [Anoob] Extra blank line. You can consider removing.

[Cheng] Sure, sorry for missing that.

> 
> > +static inline void
> > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > +			volatile struct worker_info *worker_info) {
> > +	int ret;
> > +	uint16_t nr_cpl;
> > +
> > +	ret = rte_dma_submit(dev_id, 0);
> > +	if (ret < 0) {
> > +		rte_dma_stop(dev_id);
> > +		rte_dma_close(dev_id);
> > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > +	}
> > +
> > +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> > NULL);
> > +	*async_cnt -= nr_cpl;
> > +	worker_info->total_cpl += nr_cpl;
> > +}
> > +
> > +static inline int
> > +do_dma_mem_copy(void *p)
> > +{
> > +	const uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	const uint16_t dev_id = para->dev_id;
> > +	const uint32_t nr_buf = para->nr_buf;
> > +	const uint16_t kick_batch = para->kick_batch;
> > +	const uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	uint16_t nr_cpl;
> > +	uint64_t async_cnt = 0;
> > +	uint32_t i;
> > +	uint32_t poll_cnt = 0;
> > +	int ret;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +dma_copy:
> > +			ret = rte_dma_copy(dev_id, 0,
> > rte_pktmbuf_iova(srcs[i]),
> > +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> > +			if (unlikely(ret < 0)) {
> > +				if (ret == -ENOSPC) {
> > +					do_dma_submit_and_poll(dev_id,
> > &async_cnt, worker_info);
> > +					goto dma_copy;
> > +				} else {
> > +					/* Error exit */
> > +					rte_dma_stop(dev_id);
> > +					rte_exit(EXIT_FAILURE, "DMA
> > enqueue failed\n");
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			if ((async_cnt % kick_batch) == 0)
> > +				do_dma_submit_and_poll(dev_id,
> > &async_cnt, worker_info);
> > +		}
> > +
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	rte_dma_submit(dev_id, 0);
> > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +		async_cnt -= nr_cpl;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static inline int
> > +do_cpu_mem_copy(void *p)
> > +{
> > +	const uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	const uint32_t nr_buf = para->nr_buf;
> > +	const uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	uint32_t i;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +			/* copy buffer form src to dst */
> > +			rte_memcpy((void
> > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > +				(void
> > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > +				(size_t)buf_size);
> > +			worker_info->total_cpl++;
> > +		}
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> > +			struct rte_mbuf ***dsts)
> > +{
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	unsigned int nr_sockets;
> > +	uint32_t nr_buf = cfg->nr_buf;
> > +
> > +	nr_sockets = rte_socket_count();
> > +	if (cfg->src_numa_node >= nr_sockets ||
> > +		cfg->dst_numa_node >= nr_sockets) {
> > +		printf("Error: Source or destination numa exceeds the acture
> > numa nodes.\n");
> > +		return -1;
> > +	}
> > +
> > +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > +			nr_buf, /* n == num elements */
> > +			64,  /* cache size */
> > +			0,   /* priv size */
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->src_numa_node);
> > +	if (src_pool == NULL) {
> > +		PRINT_ERR("Error with source mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > +			nr_buf, /* n == num elements */
> > +			64,  /* cache size */
> 
> [Anoob] We do not alloc or free pointers in the datapath, right? So why bother
> with cache?

[Cheng] Yes, you are right: the cache is not necessary here. I'll fix it in the next version.

> 
> > +			0,   /* priv size */
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->dst_numa_node);
> > +	if (dst_pool == NULL) {
> > +		PRINT_ERR("Error with destination mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > +	if (*srcs == NULL) {
> > +		printf("Error: srcs malloc failed.\n");
> > +		return -1;
> > +	}
> 
> [Anoob] Are we freeing these memory? The ones allocated with rte_malloc.

[Cheng] Yes, we free the memory at the end of mem_copy_benchmark(), once the test has finished.

> 
> > +
> > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > +	if (*dsts == NULL) {
> > +		printf("Error: dsts malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
> > +		printf("get src mbufs failed.\n");
> > +		return -1;
> > +	}
> > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
> > +		printf("get dst mbufs failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +void
> > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > +	uint16_t i;
> > +	uint32_t offset;
> > +	unsigned int lcore_id = 0;
> > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> > (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = ldm->cnt;
> > +	uint16_t test_secs = cfg->test_secs;
> > +	float memory;
> > +	uint32_t avg_cycles = 0;
> > +	float mops;
> > +	float bandwidth;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	if (is_dma)
> > +		if (config_dmadevs(cfg) < 0)
> > +			goto out;
> > +
> > +	if (cfg->cache_flush) {
> > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > +		rte_mb();
> > +	}
> > +
> > +	printf("Start testing....\n");
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = ldm->lcores[i];
> > +		offset = nr_buf / nb_workers * i;
> > +
> > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > lcore_params), 0);
> > +		if (!worker_params[i]) {
> > +			printf("lcore parameters malloc failure for lcore
> > %d\n", lcore_id);
> > +			break;
> > +		}
> 
> [Anoob] Are we freeing the above memory?

[Cheng] Sorry, I missed that. I'll free the worker_params memory in the next version, thanks.

> 
> > +		if (is_dma) {
> > +			worker_params[i]->dma_name = ldm-
> > >dma_names[i];
> > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > +			worker_params[i]->kick_batch = kick_batch;
> > +		}
> > +		worker_params[i]->worker_id = i;
> > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > nb_workers);
> > +		worker_params[i]->buf_size = buf_size;
> > +		worker_params[i]->test_secs = test_secs;
> > +		worker_params[i]->srcs = srcs + offset;
> > +		worker_params[i]->dsts = dsts + offset;
> > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > +		worker_params[i]->lcore_id = lcore_id;
> > +
> > +		if (is_dma)
> > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > *)(&i), lcore_id);
> > +		else
> > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > *)(&i), lcore_id);
> > +	}
> > +
> > +	while (1) {
> > +		bool ready = true;
> > +		for (i = 0; i < nb_workers; i++) {
> > +			if (worker_params[i]->worker_info.ready_flag ==
> > false) {
> > +				ready = 0;
> > +				break;
> > +			}
> > +		}
> > +		if (ready)
> > +			break;
> > +	}
> > +
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.start_flag = true;
> > +
> > +	usleep(TEST_WAIT_U_SECOND);
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.test_cpl =
> > +worker_params[i]->worker_info.total_cpl;
> > +
> > +	usleep(test_secs * 1000 * 1000);
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.test_cpl =
> > worker_params[i]->worker_info.total_cpl -
> > +						worker_params[i]-
> > >worker_info.test_cpl;
> > +
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.stop_flag = true;
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > +			worker_params[i]->worker_info.test_cpl,
> > +			&memory, &avg_cycles, &bandwidth, &mops);
> > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > +					worker_params[i]->dma_name,
> > avg_cycles, buf_size,
> > +					nr_buf / nb_workers, memory,
> > bandwidth, mops, is_dma);
> > +	}
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs)
> > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > +	if (dsts)
> > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +
> > +	if (is_dma) {
> > +		for (i = 0; i < nb_workers; i++) {
> > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > +			rte_dma_stop(ldm->dma_ids[i]);
> > +		}
> > +	}
> > +}
> > diff --git a/app/test-dma-perf/config.ini
> > b/app/test-dma-perf/config.ini new file mode 100644 index
> > 0000000000..2fd9c3c387
> > --- /dev/null
> > +++ b/app/test-dma-perf/config.ini
> > @@ -0,0 +1,59 @@
> > +
> > +; This is an example configuration file for dma-perf, which details
> > +the meanings of each parameter ; and instructions on how to use dma-perf.
> > +
> > +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
> > +
> > +; Parameters:
> > +; "mem_size" denotes the size of the memory footprint.
> > +; "buf_size" denotes the memory size of a single operation.
> > +; "dma_ring_size" denotes the dma ring buffer size. It should be
> > +greater
> > than 64 normally.
> > +; "kick_batch" denotes the dma operation batch size, and should be
> > +greater
> > than 1 normally.
> > +
> > +; The format for variables is variable=first,last,increment,ADD|MUL.
> > +
> > +; src_numa_node is used to control the numa node where the source
> > memory is allocated.
> > +; dst_numa_node is used to control the numa node where the
> > +destination
> > memory is allocated.
> > +
> > +; cache_flush is used to determine whether or not the cache should be
> > +flushed, with 1 indicating to ; flush and 0 indicating to not flush.
> > +
> > +; test_seconds controls the test time of the whole case.
> > +
> > +; To use DMA for a test, please specify the "lcore_dma" parameter.
> > +; If you have already set the "-l" and "-a" parameters using EAL, ;
> > +make sure that the value of "lcore_dma" falls within their range of
> > +the
> > values.
> > +
> > +; To use CPU for a test, please specify the "lcore" parameter.
> > +; If you have already set the "-l" and "-a" parameters using EAL, ;
> > +make sure that the value of "lcore" falls within their range of values.
> > +
> > +; To specify a configuration file, use the "--config" flag followed
> > +by the path
> > to the file.
> > +
> > +; To specify a result file, use the "--result" flag followed by the
> > +path to the
> > file.
> > +; If you do not specify a result file, one will be generated with the
> > +same name as the configuration ; file, with the addition of
> > +"_result.csv" at
> > the end.
> > +
> > +[case1]
> > +type=DMA_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=0
> > +cache_flush=0
> > +test_seconds=2
> > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> 
> [Anoob] Isn't it better if we allow user to specify DMA dev ID rather than the PCI
> DBDF?
> 
> In the long run, I would expect config file to provide {core, dma_dev_id,
> queue_id}
> 
> Another thought: why expose this at all? If we restrict this perf
> application so that each thread uses only one vchan, the application can easily
> create this mapping at run time, unless you want one thread to use 2 different
> vchans, which may not be desirable since this is a standalone perf app.

[Cheng] Thank you for the feedback.
Here are my thoughts:
Firstly, the user may not know which device a given DMA dev ID corresponds to, or which NUMA node it is on. In my example I used a CBDMA environment, so I did not specify a work queue ID. When using DSA, the configuration would be something like lcore10@0000:00:04.2-q0, which contains the core, the DMA device, and the work queue ID. The reason for exposing these options is that we want the user to fully understand which cores and devices are being used, so that they know exactly where the performance data is coming from (for example, the performance when cores and DMA devices are not on the same NUMA node). This allows the testing scenario to be precise and flexible. If the application handles the mapping itself, the user loses control over the mapping and may not get the performance data they want. We believe control should be given to the user rather than to the application.

> 
> > +eal_args=--in-memory --file-prefix=test
> > +
> > +[case2]
> > +type=CPU_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +src_numa_node=0
> > +dst_numa_node=1
> > +cache_flush=0
> > +test_seconds=2
> > +lcore = 3, 4
> > +eal_args=--in-memory --no-pci
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c new
> > file mode 100644 index 0000000000..d65655b87b
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.c
> > @@ -0,0 +1,569 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2023 Intel Corporation  */
> > +
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <getopt.h>
> > +#include <signal.h>
> > +#include <stdbool.h>
> > +#include <unistd.h>
> > +#include <sys/wait.h>
> > +#include <inttypes.h>
> > +#include <libgen.h>
> > +
> > +#include <rte_eal.h>
> > +#include <rte_cfgfile.h>
> > +#include <rte_string_fns.h>
> > +#include <rte_lcore.h>
> > +
> > +#include "main.h"
> > +
> > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > +
> > +#define MAX_EAL_PARAM_NB 100
> > +#define MAX_EAL_PARAM_LEN 1024
> > +
> > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > +
> > +#define CMDLINE_CONFIG_ARG "--config"
> > +#define CMDLINE_RESULT_ARG "--result"
> > +
> > +#define MAX_PARAMS_PER_ENTRY 4
> > +
> > +#define MAX_LONG_OPT_SZ 64
> > +
> > +enum {
> > +	TEST_TYPE_NONE = 0,
> > +	TEST_TYPE_DMA_MEM_COPY,
> > +	TEST_TYPE_CPU_MEM_COPY
> > +};
> > +
> > +#define MAX_TEST_CASES 16
> > +static struct test_configure test_cases[MAX_TEST_CASES];
> > +
> > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > +
> > +static FILE *fd;
> > +
> > +static void
> > +output_csv(bool need_blankline)
> > +{
> > +	uint32_t i;
> > +
> > +	if (need_blankline) {
> > +		fprintf(fd, ",,,,,,,,\n");
> > +		fprintf(fd, ",,,,,,,,\n");
> > +	}
> > +
> > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > +		if (output_str[i][0]) {
> > +			fprintf(fd, "%s", output_str[i]);
> > +			output_str[i][0] = '\0';
> > +		}
> > +	}
> > +
> > +	fflush(fd);
> > +}
> > +
> > +static void
> > +output_env_info(void)
> > +{
> > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > environment:\n");
> > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%"
> > +			PRIu64 "\n", rte_get_timer_hz());
> > +
> > +	output_csv(true);
> > +}
> > +
> > +static void
> > +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > +			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
> > +
> > +	output_csv(true);
> > +}
> > +
> > +static void
> > +run_test_case(struct test_configure *case_cfg) {
> > +	switch (case_cfg->test_type) {
> > +	case TEST_TYPE_DMA_MEM_COPY:
> > +		mem_copy_benchmark(case_cfg, true);
> > +		break;
> > +	case TEST_TYPE_CPU_MEM_COPY:
> > +		mem_copy_benchmark(case_cfg, false);
> > +		break;
> > +	default:
> > +		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> > +		break;
> > +	}
> > +}
> > +
> > +static void
> > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > +	uint32_t i;
> > +	uint32_t nb_lcores = rte_lcore_count();
> > +	struct test_configure_entry *mem_size = &case_cfg->mem_size;
> > +	struct test_configure_entry *buf_size = &case_cfg->buf_size;
> > +	struct test_configure_entry *ring_size = &case_cfg->ring_size;
> > +	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
> > +	struct test_configure_entry dummy = { 0 };
> > +	struct test_configure_entry *var_entry = &dummy;
> > +
> > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > +
> > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > +		printf("Case %u: Not enough lcores.\n", case_id);
> > +		return;
> > +	}
> > +
> > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > +
> > +	if (mem_size->incr != 0)
> > +		var_entry = mem_size;
> > +
> > +	if (buf_size->incr != 0)
> > +		var_entry = buf_size;
> > +
> > +	if (ring_size->incr != 0)
> > +		var_entry = ring_size;
> > +
> > +	if (kick_batch->incr != 0)
> > +		var_entry = kick_batch;
> > +
> > +	case_cfg->scenario_id = 0;
> > +
> > +	output_header(case_id, case_cfg);
> > +
> > +	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry-
> > >last;) {
> > +		case_cfg->scenario_id++;
> > +		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
> > +
> > +		run_test_case(case_cfg);
> > +		output_csv(false);
> > +
> > +		if (var_entry->op == OP_ADD)
> > +			var_entry->cur += var_entry->incr;
> > +		else if (var_entry->op == OP_MUL)
> > +			var_entry->cur *= var_entry->incr;
> > +		else
> > +			break;
> > +	}
> > +}
> > +
> > +static int
> > +parse_lcore(struct test_configure *test_case, const char *value) {
> > +	size_t len = strlen(value);
> > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > +	strcpy(input, value);
> > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > >lcore_dma_map);
> > +
> > +	if (test_case == NULL || value == NULL)
> > +		return -1;
> > +
> > +	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
> > +
> > +	char *token = strtok(input, ", ");
> > +	while (token != NULL) {
> > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > +			free(input);
> > +			return -1;
> > +		}
> > +
> > +		uint16_t lcore_id = atoi(token);
> > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;
> > +
> > +		token = strtok(NULL, ", ");
> > +	}
> > +
> > +	free(input);
> > +	return 0;
> > +}
> > +
> > +static int
> > +parse_lcore_dma(struct test_configure *test_case, const char *value) {
> > +	struct lcore_dma_map_t *lcore_dma_map;
> > +	char *input = strndup(value, strlen(value) + 1);
> > +	char *addrs = input;
> > +	char *ptrs[2];
> > +	char *start, *end, *substr;
> > +	uint16_t lcore_id;
> > +	int ret = 0;
> > +
> > +	while (*addrs == '\0')
> > +		addrs++;
> > +	if (*addrs == '\0') {
> > +		fprintf(stderr, "No input DMA addresses\n");
> > +		ret = -1;
> > +		goto out;
> > +	}
> > +
> > +	substr = strtok(addrs, ",");
> > +	if (substr == NULL) {
> > +		fprintf(stderr, "No input DMA address\n");
> > +		ret = -1;
> > +		goto out;
> > +	}
> > +
> > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > lcore_dma_map_t));
> > +
> > +	do {
> > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > +
> > +		start = strstr(ptrs[0], "lcore");
> > +		if (start == NULL) {
> > +			fprintf(stderr, "Illegal lcore\n");
> > +			ret = -1;
> > +			break;
> > +		}
> > +
> > +		start += 5;
> > +		lcore_id = strtol(start, &end, 0);
> > +		if (end == start) {
> > +			fprintf(stderr, "No input lcore ID or ID %d is
> > wrong\n", lcore_id);
> > +			ret = -1;
> > +			break;
> > +		}
> > +
> > +		lcore_dma_map = &test_case->lcore_dma_map;
> > +		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
> > +		strcpy(lcore_dma_map->dma_names[lcore_dma_map-
> > >cnt], ptrs[1]);
> > +		lcore_dma_map->cnt++;
> > +		substr = strtok(NULL, ",");
> > +	} while (substr != NULL);
> > +
> > +out:
> > +	free(input);
> > +	return ret;
> > +}
> > +
> > +static int
> > +parse_entry(const char *value, struct test_configure_entry *entry) {
> > +	char input[255] = {0};
> > +	char *args[MAX_PARAMS_PER_ENTRY];
> > +	int args_nr = -1;
> > +
> > +	if (value == NULL || entry == NULL)
> > +		goto out;
> > +
> > +	strncpy(input, value, 254);
> > +	if (*input == '\0')
> > +		goto out;
> > +
> > +	args_nr = rte_strsplit(input, strlen(input), args,
> > MAX_PARAMS_PER_ENTRY, ',');
> > +	if (args_nr != 1 && args_nr != 4)
> > +		goto out;
> > +
> > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > +
> > +	if (args_nr == 4) {
> > +		entry->last = (uint32_t)atoi(args[1]);
> > +		entry->incr = (uint32_t)atoi(args[2]);
> > +		if (!strcmp(args[3], "MUL"))
> > +			entry->op = OP_MUL;
> > +		else if (!strcmp(args[3], "ADD"))
> > +			entry->op = OP_ADD;
> > +		else {
> > +			printf("Invalid op %s.\n", args[3]);
> > +			args_nr = -1;
> > +		}
> > +	} else {
> > +		entry->op = OP_NONE;
> > +		entry->last = 0;
> > +		entry->incr = 0;
> > +	}
> > +out:
> > +	return args_nr;
> > +}
> > +
> > +static uint16_t
> > +load_configs(const char *path)
> > +{
> > +	struct rte_cfgfile *cfgfile;
> > +	int nb_sections, i;
> > +	struct test_configure *test_case;
> > +	char section_name[CFG_NAME_LEN];
> > +	const char *case_type;
> > +	const char *lcore_dma;
> > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > *kick_batch_str;
> > +	int args_nr, nb_vp;
> > +	bool is_dma;
> > +
> > +	printf("config file parsing...\n");
> > +	cfgfile = rte_cfgfile_load(path, 0);
> > +	if (!cfgfile) {
> > +		printf("Open configure file error.\n");
> > +		exit(1);
> > +	}
> > +
> > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > +	if (nb_sections > MAX_TEST_CASES) {
> > +		printf("Error: The maximum number of cases is %d.\n",
> > MAX_TEST_CASES);
> > +		exit(1);
> > +	}
> > +
> > +	for (i = 0; i < nb_sections; i++) {
> > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
> > +		test_case = &test_cases[i];
> > +		case_type = rte_cfgfile_get_entry(cfgfile, section_name,
> > "type");
> > +		if (!case_type) {
> > +			printf("Error: No case type in case %d, the test will be
> > finished here.\n",
> > +				i + 1);
> > +			test_case->is_valid = false;
> > +			continue;
> > +		}
> > +
> > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > +			test_case->test_type =
> > TEST_TYPE_DMA_MEM_COPY;
> > +			test_case->test_type_str = DMA_MEM_COPY;
> > +			is_dma = true;
> > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > +			test_case->test_type =
> > TEST_TYPE_CPU_MEM_COPY;
> > +			test_case->test_type_str = CPU_MEM_COPY;
> > +			is_dma = false;
> > +		} else {
> > +			printf("Error: Cannot find case type %s in case%d.\n",
> > case_type, i + 1);
> > +			test_case->is_valid = false;
> > +			continue;
> > +		}
> > +
> > +		nb_vp = 0;
> > +
> > +		test_case->src_numa_node =
> > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +
> > 	section_name, "src_numa_node"));
> > +		test_case->dst_numa_node =
> > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +
> > 	section_name, "dst_numa_node"));
> > +
> > +		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> > "mem_size");
> > +		args_nr = parse_entry(mem_size_str, &test_case-
> > >mem_size);
> > +		if (args_nr < 0) {
> > +			printf("parse error in case %d.\n", i + 1);
> > +			test_case->is_valid = false;
> > +			continue;
> > +		} else if (args_nr > 1)
> > +			nb_vp++;
> > +
> > +		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> > "buf_size");
> > +		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
> > +		if (args_nr < 0) {
> > +			printf("parse error in case %d.\n", i + 1);
> > +			test_case->is_valid = false;
> > +			continue;
> > +		} else if (args_nr > 1)
> > +			nb_vp++;
> > +
> > +		if (is_dma) {
> > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > section_name,
> > +
> > 	"dma_ring_size");
> > +			args_nr = parse_entry(ring_size_str, &test_case-
> > >ring_size);
> > +			if (args_nr < 0) {
> > +				printf("parse error in case %d.\n", i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			} else if (args_nr > 1)
> > +				nb_vp++;
> > +
> > +			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
> > section_name, "kick_batch");
> > +			args_nr = parse_entry(kick_batch_str, &test_case-
> > >kick_batch);
> > +			if (args_nr < 0) {
> > +				printf("parse error in case %d.\n", i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			} else if (args_nr > 1)
> > +				nb_vp++;
> > +
> > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > section_name, "lcore_dma");
> > +			int lcore_ret = parse_lcore_dma(test_case,
> > lcore_dma);
> > +			if (lcore_ret < 0) {
> > +				printf("parse lcore dma error in case %d.\n", i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			}
> > +		} else {
> > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > section_name, "lcore");
> > +			int lcore_ret = parse_lcore(test_case, lcore_dma);
> > +			if (lcore_ret < 0) {
> > +				printf("parse lcore error in case %d.\n", i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			}
> > +		}
> > +
> > +		if (nb_vp > 1) {
> > +			printf("Error, each section can only have a single
> > variable parameter.\n");
> > +			test_case->is_valid = false;
> > +			continue;
> > +		}
> > +
> > +		test_case->cache_flush =
> > +			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name,
> > "cache_flush"));
> > +		test_case->test_secs =
> > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > +					section_name, "test_seconds"));
> > +
> > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > section_name, "eal_args");
> > +		test_case->is_valid = true;
> > +	}
> > +
> > +	rte_cfgfile_close(cfgfile);
> > +	printf("config file parsing complete.\n\n");
> > +	return i;
> > +}
> > +
> > +/* Parse the argument given in the command line of the application */
> > +static int append_eal_args(int argc, char **argv, const char
> > +*eal_args, char **new_argv) {
> > +	int i;
> > +	char *tokens[MAX_EAL_PARAM_NB];
> > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > +	int token_nb, new_argc = 0;
> > +
> > +	for (i = 0; i < argc; i++) {
> > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > +				(strcmp(argv[i], CMDLINE_RESULT_ARG) ==
> > 0)) {
> > +			i++;
> > +			continue;
> > +		}
> > +		strlcpy(new_argv[new_argc], argv[i],
> > sizeof(new_argv[new_argc]));
> > +		new_argc++;
> > +	}
> > +
> > +	if (eal_args) {
> > +		strlcpy(args, eal_args, sizeof(args));
> > +		token_nb = rte_strsplit(args, strlen(args),
> > +					tokens, MAX_EAL_PARAM_NB, ' ');
> > +		for (i = 0; i < token_nb; i++)
> > +			strcpy(new_argv[new_argc++], tokens[i]);
> > +	}
> > +
> > +	return new_argc;
> > +}
> > +
> > +int
> > +main(int argc, char *argv[])
> > +{
> > +	int ret;
> > +	uint16_t case_nb;
> > +	uint32_t i, nb_lcores;
> > +	pid_t cpid, wpid;
> > +	int wstatus;
> > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > +	char *pargs[MAX_EAL_PARAM_NB];
> > +	char *cfg_path_ptr = NULL;
> > +	char *rst_path_ptr = NULL;
> > +	char rst_path[PATH_MAX];
> > +	int new_argc;
> > +	bool is_first_case = true;
> > +
> > +	memset(args, 0, sizeof(args));
> > +
> > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > +		pargs[i] = args[i];
> > +
> > +	for (i = 0; i < (uint32_t)argc; i++) {
> > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > MAX_LONG_OPT_SZ) == 0)
> > +			cfg_path_ptr = argv[i + 1];
> > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > MAX_LONG_OPT_SZ) == 0)
> > +			rst_path_ptr = argv[i + 1];
> > +	}
> > +	if (cfg_path_ptr == NULL) {
> > +		printf("Config file not assigned.\n");
> > +		return -1;
> > +	}
> > +	if (rst_path_ptr == NULL) {
> > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > +		strcat(strtok(basename(rst_path), "."), "_result.csv");
> > +		rst_path_ptr = rst_path;
> > +	}
> > +
> > +	case_nb = load_configs(cfg_path_ptr);
> > +	fd = fopen(rst_path_ptr, "w");
> > +	if (fd == NULL) {
> > +		printf("Open output CSV file error.\n");
> > +		return -1;
> > +	}
> > +	fclose(fd);
> > +
> > +	for (i = 0; i < case_nb; i++) {
> > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > +			printf("No test type in test case %d.\n\n", i + 1);
> > +			continue;
> > +		}
> > +		if (!test_cases[i].is_valid) {
> > +			printf("Invalid test case %d.\n\n", i + 1);
> > +			continue;
> > +		}
> > +
> > +		cpid = fork();
> 
> [Anoob] Do we really need fork()? Can't we use code like,
> 
> 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> 			ret |= rte_eal_wait_lcore(lcore_id);
> 		}
> 
> to wait for all threads to exit?

[Cheng] Good question. fork() is used here to create a new process for each test case. For flexibility, we want each test case to have its own EAL environment, so the EAL must be reinitialized for each case. However, the EAL can only be initialized once per process. Therefore, we use a new process to run each test case. Moreover, the test cases run sequentially and do not affect each other, which ensures the accuracy of the performance data. Your code would wait for all threads to exit in the same process. However, it would not provide a "clean" environment for each test case like fork() does. fork() allows us to have a fully reinitialized environment, with no impact or side effects from previous test cases. This results in clean, precise performance data for each case.

Please let me know your thoughts on this. And please let me know if you have any other questions or require any clarification.

Thanks,
Cheng

> 
> > +		if (cpid < 0) {
> > +			printf("Fork case %d failed.\n", i + 1);
> > +			exit(EXIT_FAILURE);
> > +		} else if (cpid == 0) {
> > +			printf("\nRunning case %u\n\n", i + 1);
> > +
> > +			new_argc = append_eal_args(argc, argv,
> > test_cases[i].eal_args, pargs);
> > +			ret = rte_eal_init(new_argc, pargs);
> > +			if (ret < 0)
> > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > arguments\n");
> > +
> > +			/* Check lcores. */
> > +			nb_lcores = rte_lcore_count();
> > +			if (nb_lcores < 2)
> > +				rte_exit(EXIT_FAILURE,
> > +					"There should be at least 2 worker
> > lcores.\n");
> > +
> > +			fd = fopen(rst_path_ptr, "a");
> > +			if (!fd) {
> > +				printf("Open output CSV file error.\n");
> > +				return 0;
> > +			}
> > +
> > +			if (is_first_case) {
> > +				output_env_info();
> > +				is_first_case = false;
> > +			}
> > +			run_test(i + 1, &test_cases[i]);
> > +
> > +			/* clean up the EAL */
> > +			rte_eal_cleanup();
> > +
> > +			fclose(fd);
> > +
> > +			printf("\nCase %u completed.\n\n", i + 1);
> > +
> > +			exit(EXIT_SUCCESS);
> > +		} else {
> > +			wpid = waitpid(cpid, &wstatus, 0);
> > +			if (wpid == -1) {
> > +				printf("waitpid error.\n");
> > +				exit(EXIT_FAILURE);
> > +			}
> > +
> > +			if (WIFEXITED(wstatus))
> > +				printf("Case process exited. status %d\n\n",
> > +					WEXITSTATUS(wstatus));
> > +			else if (WIFSIGNALED(wstatus))
> > +				printf("Case process killed by signal %d\n\n",
> > +					WTERMSIG(wstatus));
> > +			else if (WIFSTOPPED(wstatus))
> > +				printf("Case process stopped by signal
> > %d\n\n",
> > +					WSTOPSIG(wstatus));
> > +			else if (WIFCONTINUED(wstatus))
> > +				printf("Case process continued.\n\n");
> > +			else
> > +				printf("Case process unknown
> > terminated.\n\n");
> > +		}
> > +	}
> > +
> > +	printf("Bye...\n");
> > +	return 0;
> > +}
> > +
> > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new
> > file mode 100644 index 0000000000..215ac42673
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.h
> > @@ -0,0 +1,69 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2023 Intel Corporation  */
> > +
> > +#ifndef _MAIN_H_
> > +#define _MAIN_H_
> > +
> > +
> > +#include <rte_common.h>
> > +#include <rte_cycles.h>
> > +#include <rte_dev.h>
> > +#include <rte_dmadev.h>
> > +
> > +#ifndef __maybe_unused
> > +#define __maybe_unused	__rte_unused
> > +#endif
> > +
> > +#define MAX_WORKER_NB 128
> > +#define MAX_OUTPUT_STR_LEN 512
> > +
> > +#define MAX_DMA_NB 128
> > +#define MAX_LCORE_NB 256
> > +
> > +extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > +
> > +typedef enum {
> > +	OP_NONE = 0,
> > +	OP_ADD,
> > +	OP_MUL
> > +} alg_op_type;
> > +
> > +struct test_configure_entry {
> > +	uint32_t first;
> > +	uint32_t last;
> > +	uint32_t incr;
> > +	alg_op_type op;
> > +	uint32_t cur;
> > +};
> > +
> > +struct lcore_dma_map_t {
> > +	uint32_t lcores[MAX_WORKER_NB];
> > +	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > +	int16_t dma_ids[MAX_WORKER_NB];
> > +	uint16_t cnt;
> > +};
> > +
> > +struct test_configure {
> > +	bool is_valid;
> > +	uint8_t test_type;
> > +	const char *test_type_str;
> > +	uint16_t src_numa_node;
> > +	uint16_t dst_numa_node;
> > +	uint16_t opcode;
> > +	bool is_dma;
> > +	struct lcore_dma_map_t lcore_dma_map;
> > +	struct test_configure_entry mem_size;
> > +	struct test_configure_entry buf_size;
> > +	struct test_configure_entry ring_size;
> > +	struct test_configure_entry kick_batch;
> > +	uint32_t cache_flush;
> > +	uint32_t nr_buf;
> > +	uint16_t test_secs;
> > +	const char *eal_args;
> > +	uint8_t scenario_id;
> > +};
> > +
> > +void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> > +
> > +#endif /* _MAIN_H_ */
> > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > --- /dev/null
> > +++ b/app/test-dma-perf/meson.build
> > @@ -0,0 +1,17 @@
> > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > +Intel Corporation
> > +
> > +# meson file, for building this app as part of a main DPDK build.
> > +
> > +if is_windows
> > +    build = false
> > +    reason = 'not supported on Windows'
> > +    subdir_done()
> > +endif
> > +
> > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > +
> > +sources = files(
> > +        'main.c',
> > +        'benchmark.c',
> > +)
> > --
> > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-15  8:01     ` Jiang, Cheng1
@ 2023-06-15  8:44       ` Anoob Joseph
  2023-06-15 14:05         ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-15  8:44 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Cheng,

Please see inline.

Thanks,
Anoob

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Thursday, June 15, 2023 1:31 PM
> To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> Prakash Shukla <amitprakashs@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Thanks for your comments, the replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Thursday, June 15, 2023 1:22 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> > Prakash Shukla <amitprakashs@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi,
> >
> > Thanks for working on the comments. A few more top-level comments inline.
> >
> > Thanks,
> > Anoob
> >
> > > -----Original Message-----
> > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > Sent: Tuesday, June 13, 2023 10:02 AM
> > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash
> Shukla
> > > <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>
> > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> xingguang.he@intel.com;
> > > Cheng Jiang <cheng1.jiang@intel.com>
> > > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > External Email
> > >
> > > --------------------------------------------------------------------
> > > -- There are many high-performance DMA devices supported in DPDK
> > > now, and these DMA devices can also be integrated into other modules
> > > of DPDK as accelerators, such as Vhost. Before integrating DMA into
> > > applications, developers need to know the performance of these DMA
> > > devices in various scenarios and the performance of CPUs in the same
> > > scenario, such as different buffer lengths. Only in this way can we
> > > know the target performance of the application accelerated by using
> > > them. This patch introduces a high-performance testing tool, which
> > > supports comparing the performance of CPU and DMA in different
> > > scenarios automatically with a pre-set config file. Memory Copy
> > > performance tests are supported for now.
> > >
> > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > ---
> > > v6:
> > >   improved code based on Anoob's comments;
> > >   fixed some code structure issues;
> > > v5:
> > >   fixed some LONG_LINE warnings;
> > > v4:
> > >   fixed inaccuracy of the memory footprint display;
> > > v3:
> > >   fixed some typos;
> > > v2:
> > >   added lcore/dmadev designation;
> > >   added error case process;
> > >   removed worker_threads parameter from config.ini;
> > >   improved the logs;
> > >   improved config file;
> > >
> > >  app/meson.build               |   1 +
> > >  app/test-dma-perf/benchmark.c | 477
> ++++++++++++++++++++++++++++
> > > app/test-dma-perf/config.ini  |  59 ++++
> > >  app/test-dma-perf/main.c      | 569
> > > ++++++++++++++++++++++++++++++++++
> > >  app/test-dma-perf/main.h      |  69 +++++
> > >  app/test-dma-perf/meson.build |  17 +
> > >  6 files changed, 1192 insertions(+)  create mode 100644
> > > app/test-dma-perf/benchmark.c  create mode 100644
> > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > mode
> > > 100644 app/test-dma-perf/meson.build
> > >
> > > diff --git a/app/meson.build b/app/meson.build index
> > > 74d2420f67..4fc1a83eba 100644
> > > --- a/app/meson.build
> > > +++ b/app/meson.build
> > > @@ -19,6 +19,7 @@ apps = [
> > >          'test-cmdline',
> > >          'test-compress-perf',
> > >          'test-crypto-perf',
> > > +        'test-dma-perf',
> > >          'test-eventdev',
> > >          'test-fib',
> > >          'test-flow-perf',
> > > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > > perf/benchmark.c new file mode 100644 index 0000000000..bc1ca82297
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/benchmark.c
> > > @@ -0,0 +1,477 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright(c) 2023 Intel Corporation  */
> > > +
> > > +#include <inttypes.h>
> > > +#include <stdio.h>
> > > +#include <stdlib.h>
> > > +#include <unistd.h>
> > > +
> > > +#include <rte_time.h>
> > > +#include <rte_mbuf.h>
> > > +#include <rte_dmadev.h>
> > > +#include <rte_malloc.h>
> > > +#include <rte_lcore.h>
> > > +
> > > +#include "main.h"
> > > +
> > > +#define MAX_DMA_CPL_NB 255
> > > +
> > > +#define TEST_WAIT_U_SECOND 10000
> > > +
> > > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%"
> PRIu64
> > > ",%.3lf,%.3lf\n"
> > > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%"
> PRIu64
> > > ",%.3lf,%.3lf\n"
> > > +
> > > +struct worker_info {
> > > +	bool ready_flag;
> > > +	bool start_flag;
> > > +	bool stop_flag;
> > > +	uint32_t total_cpl;
> > > +	uint32_t test_cpl;
> > > +};
> > > +
> > > +struct lcore_params {
> > > +	uint8_t scenario_id;
> > > +	unsigned int lcore_id;
> > > +	char *dma_name;
> > > +	uint16_t worker_id;
> > > +	uint16_t dev_id;
> > > +	uint32_t nr_buf;
> > > +	uint16_t kick_batch;
> > > +	uint32_t buf_size;
> > > +	uint16_t test_secs;
> > > +	struct rte_mbuf **srcs;
> > > +	struct rte_mbuf **dsts;
> > > +	struct worker_info worker_info;
> > > +};
> > > +
> > > +static struct rte_mempool *src_pool; static struct rte_mempool
> > > +*dst_pool;
> > > +
> > > +static volatile struct lcore_params
> *worker_params[MAX_WORKER_NB];
> > > +
> > > +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> > > +
> > > +static inline int
> > > +__rte_format_printf(3, 4)
> > > +print_err(const char *func, int lineno, const char *format, ...) {
> > > +	va_list ap;
> > > +	int ret;
> > > +
> > > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > > +	va_start(ap, format);
> > > +	ret += vfprintf(stderr, format, ap);
> > > +	va_end(ap);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static inline void
> > > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t
> > > +nb_workers,
> > > uint16_t test_secs,
> > > +				uint32_t total_cnt, float *memory, uint32_t
> > > *ave_cycle,
> > > +				float *bandwidth, float *mops)
> > > +{
> > > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 *
> > > 1024);
> > > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > > (float)*ave_cycle)) / 1000000000;
> > > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > > +
> > > +static void
> > > +output_result(uint8_t scenario_id, uint32_t lcore_id, char
> > > +*dma_name,
> > > uint64_t ave_cycle,
> > > +			uint32_t buf_size, uint32_t nr_buf, float memory,
> > > +			float bandwidth, float mops, bool is_dma) {
> > > +	if (is_dma)
> > > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > > +	else
> > > +		printf("lcore %u\n", lcore_id);
> > > +
> > > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf:
> > > +%u,
> > > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > > +			ave_cycle, buf_size, nr_buf, memory,
> > > rte_get_timer_hz());
> > > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth,
> > > +mops);
> > > +
> > > +	if (is_dma)
> > > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > > CSV_LINE_DMA_FMT,
> > > +			scenario_id, lcore_id, dma_name, buf_size,
> > > +			nr_buf, memory, ave_cycle, bandwidth, mops);
> > > +	else
> > > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > > CSV_LINE_CPU_FMT,
> > > +			scenario_id, lcore_id, buf_size,
> > > +			nr_buf, memory, ave_cycle, bandwidth, mops); }
> > > +
> > > +static inline void
> > > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > > +		__maybe_unused uint32_t buf_size,
> > > +		__maybe_unused uint32_t nr_buf)
> > > +{
> > > +#ifdef RTE_ARCH_X86_64
> > > +	char *data;
> > > +	struct rte_mbuf **srcs = array;
> > > +	uint32_t i, offset;
> > > +
> > > +	for (i = 0; i < nr_buf; i++) {
> > > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > > +		for (offset = 0; offset < buf_size; offset += 64)
> > > +			__builtin_ia32_clflush(data + offset);
> > > +	}
> > > +#endif
> > > +}
> > > +
> > > +/* Configuration of device. */
> > > +static void
> > > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > > +	uint16_t vchan = 0;
> > > +	struct rte_dma_info info;
> > > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > +	struct rte_dma_vchan_conf qconf = {
> > > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > +		.nb_desc = ring_size
> > > +	};
> > > +
> > > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > > +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> > > +
> > > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > > +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> > > +
> > > +	rte_dma_info_get(dev_id, &info);
> > > +	if (info.nb_vchans != 1)
> > > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > > reported on device id. %u\n",
> > > +				dev_id);
> > > +
> > > +	if (rte_dma_start(dev_id) != 0)
> > > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > > +
> > > +static int
> > > +config_dmadevs(struct test_configure *cfg) {
> > > +	uint32_t ring_size = cfg->ring_size.cur;
> > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > +	uint32_t nb_workers = ldm->cnt;
> > > +	uint32_t i;
> > > +	int dev_id;
> > > +	uint16_t nb_dmadevs = 0;
> > > +	char *dma_name;
> > > +
> > > +	for (i = 0; i < ldm->cnt; i++) {
> > > +		dma_name = ldm->dma_names[i];
> > > +		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> > > +		if (dev_id == -1) {
> > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > dma_name);
> > > +			goto end;
> > > +		}
> > > +
> > > +		ldm->dma_ids[i] = dev_id;
> > > +		configure_dmadev_queue(dev_id, ring_size);
> > > +		++nb_dmadevs;
> > > +	}
> > > +
> > > +end:
> > > +	if (nb_dmadevs < nb_workers) {
> > > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> > > nb_dmadevs, nb_workers);
> > > +		return -1;
> > > +	}
> > > +
> > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +#define POLL_MAX 1000
> > > +
> > > +
> >
> > [Anoob] Extra blank line. You can consider removing.
> 
> [Cheng] sure, sorry for the miss.
> 
> >
> > > +static inline void
> > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > > +			volatile struct worker_info *worker_info) {
> > > +	int ret;
> > > +	uint16_t nr_cpl;
> > > +
> > > +	ret = rte_dma_submit(dev_id, 0);
> > > +	if (ret < 0) {
> > > +		rte_dma_stop(dev_id);
> > > +		rte_dma_close(dev_id);
> > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > +	}
> > > +
> > > +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> > > NULL);
> > > +	*async_cnt -= nr_cpl;
> > > +	worker_info->total_cpl += nr_cpl;
> > > +}
> > > +
> > > +static inline int
> > > +do_dma_mem_copy(void *p)
> > > +{
> > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > +	const uint16_t dev_id = para->dev_id;
> > > +	const uint32_t nr_buf = para->nr_buf;
> > > +	const uint16_t kick_batch = para->kick_batch;
> > > +	const uint32_t buf_size = para->buf_size;
> > > +	struct rte_mbuf **srcs = para->srcs;
> > > +	struct rte_mbuf **dsts = para->dsts;
> > > +	uint16_t nr_cpl;
> > > +	uint64_t async_cnt = 0;
> > > +	uint32_t i;
> > > +	uint32_t poll_cnt = 0;
> > > +	int ret;
> > > +
> > > +	worker_info->stop_flag = false;
> > > +	worker_info->ready_flag = true;
> > > +
> > > +	while (!worker_info->start_flag)
> > > +		;
> > > +
> > > +	while (1) {
> > > +		for (i = 0; i < nr_buf; i++) {
> > > +dma_copy:
> > > +			ret = rte_dma_copy(dev_id, 0,
> > > rte_pktmbuf_iova(srcs[i]),
> > > +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> > > +			if (unlikely(ret < 0)) {
> > > +				if (ret == -ENOSPC) {
> > > +					do_dma_submit_and_poll(dev_id,
> > > &async_cnt, worker_info);
> > > +					goto dma_copy;
> > > +				} else {
> > > +					/* Error exit */
> > > +					rte_dma_stop(dev_id);
> > > +					rte_exit(EXIT_FAILURE, "DMA
> > > enqueue failed\n");
> > > +				}
> > > +			}
> > > +			async_cnt++;
> > > +
> > > +			if ((async_cnt % kick_batch) == 0)
> > > +				do_dma_submit_and_poll(dev_id,
> > > &async_cnt, worker_info);
> > > +		}
> > > +
> > > +		if (worker_info->stop_flag)
> > > +			break;
> > > +	}
> > > +
> > > +	rte_dma_submit(dev_id, 0);
> > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > +		async_cnt -= nr_cpl;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static inline int
> > > +do_cpu_mem_copy(void *p)
> > > +{
> > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > +	const uint32_t nr_buf = para->nr_buf;
> > > +	const uint32_t buf_size = para->buf_size;
> > > +	struct rte_mbuf **srcs = para->srcs;
> > > +	struct rte_mbuf **dsts = para->dsts;
> > > +	uint32_t i;
> > > +
> > > +	worker_info->stop_flag = false;
> > > +	worker_info->ready_flag = true;
> > > +
> > > +	while (!worker_info->start_flag)
> > > +		;
> > > +
> > > +	while (1) {
> > > +		for (i = 0; i < nr_buf; i++) {
> > > +			/* copy buffer form src to dst */
> > > +			rte_memcpy((void
> > > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > > +				(void
> > > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > > +				(size_t)buf_size);
> > > +			worker_info->total_cpl++;
> > > +		}
> > > +		if (worker_info->stop_flag)
> > > +			break;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int
> > > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf
> ***srcs,
> > > +			struct rte_mbuf ***dsts)
> > > +{
> > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > +	unsigned int nr_sockets;
> > > +	uint32_t nr_buf = cfg->nr_buf;
> > > +
> > > +	nr_sockets = rte_socket_count();
> > > +	if (cfg->src_numa_node >= nr_sockets ||
> > > +		cfg->dst_numa_node >= nr_sockets) {
> > > +		printf("Error: Source or destination numa exceeds the acture
> > > numa nodes.\n");
> > > +		return -1;
> > > +	}
> > > +
> > > +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > > +			nr_buf, /* n == num elements */
> > > +			64,  /* cache size */
> > > +			0,   /* priv size */
> > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > +			cfg->src_numa_node);
> > > +	if (src_pool == NULL) {
> > > +		PRINT_ERR("Error with source mempool creation.\n");
> > > +		return -1;
> > > +	}
> > > +
> > > +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > > +			nr_buf, /* n == num elements */
> > > +			64,  /* cache size */
> >
> > [Anoob] We do not alloc or free pointers in the datapath, right? So
> > why bother with cache?
> 
> [Cheng] Yes, you are right, the cache size is not necessary here, I'll fix it in the
> next version.
> 
> >
> > > +			0,   /* priv size */
> > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > +			cfg->dst_numa_node);
> > > +	if (dst_pool == NULL) {
> > > +		PRINT_ERR("Error with destination mempool creation.\n");
> > > +		return -1;
> > > +	}
> > > +
> > > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > > +	if (*srcs == NULL) {
> > > +		printf("Error: srcs malloc failed.\n");
> > > +		return -1;
> > > +	}
> >
> > [Anoob] Are we freeing this memory? I mean the arrays allocated with
> > rte_malloc.
> 
> [Cheng] Yes, we free the memory at the end of mem_copy_benchmark()
> when the test finishes.

[Anoob] I think we are not freeing this memory. In the place where we free everything, we do return all objects to the mempools and free the mempools themselves. But this memory holds the arrays of pointers, right? Is that getting freed anywhere?

Also, in the memory cleanup paths, do we need to clear the static variables (i.e., set srcs, src_pool, dsts, dst_pool to NULL) so that there is no scope for a double free?

> 
> >
> > > +
> > > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > > +	if (*dsts == NULL) {
> > > +		printf("Error: dsts malloc failed.\n");
> > > +		return -1;
> > > +	}
> > > +
> > > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
> > > +		printf("get src mbufs failed.\n");
> > > +		return -1;
> > > +	}
> > > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
> > > +		printf("get dst mbufs failed.\n");
> > > +		return -1;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +void
> > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > +	uint16_t i;
> > > +	uint32_t offset;
> > > +	unsigned int lcore_id = 0;
> > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024)
> > > +/
> > > (cfg->buf_size.cur * 2);
> > > +	uint16_t nb_workers = ldm->cnt;
> > > +	uint16_t test_secs = cfg->test_secs;
> > > +	float memory;
> > > +	uint32_t avg_cycles = 0;
> > > +	float mops;
> > > +	float bandwidth;
> > > +
> > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > +		goto out;
> > > +
> > > +	if (is_dma)
> > > +		if (config_dmadevs(cfg) < 0)
> > > +			goto out;
> > > +
> > > +	if (cfg->cache_flush) {
> > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > +		rte_mb();
> > > +	}
> > > +
> > > +	printf("Start testing....\n");
> > > +
> > > +	for (i = 0; i < nb_workers; i++) {
> > > +		lcore_id = ldm->lcores[i];
> > > +		offset = nr_buf / nb_workers * i;
> > > +
> > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > lcore_params), 0);
> > > +		if (!worker_params[i]) {
> > > +			printf("lcore parameters malloc failure for lcore
> > > %d\n", lcore_id);
> > > +			break;
> > > +		}
> >
> > [Anoob] Are we freeing the above memory?
> 
> [Cheng] Sorry, I missed that. I'll free the worker_params memory in the next
> version, thanks.
> 
> >
> > > +		if (is_dma) {
> > > +			worker_params[i]->dma_name = ldm-
> > > >dma_names[i];
> > > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > > +			worker_params[i]->kick_batch = kick_batch;
> > > +		}
> > > +		worker_params[i]->worker_id = i;
> > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > nb_workers);
> > > +		worker_params[i]->buf_size = buf_size;
> > > +		worker_params[i]->test_secs = test_secs;
> > > +		worker_params[i]->srcs = srcs + offset;
> > > +		worker_params[i]->dsts = dsts + offset;
> > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > +		worker_params[i]->lcore_id = lcore_id;
> > > +
> > > +		if (is_dma)
> > > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > > *)(&i), lcore_id);
> > > +		else
> > > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > > *)(&i), lcore_id);
> > > +	}
> > > +
> > > +	while (1) {
> > > +		bool ready = true;
> > > +		for (i = 0; i < nb_workers; i++) {
> > > +			if (worker_params[i]->worker_info.ready_flag ==
> > > false) {
> > > +				ready = 0;
> > > +				break;
> > > +			}
> > > +		}
> > > +		if (ready)
> > > +			break;
> > > +	}
> > > +
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.start_flag = true;
> > > +
> > > +	usleep(TEST_WAIT_U_SECOND);
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.test_cpl =
> > > +worker_params[i]->worker_info.total_cpl;
> > > +
> > > +	usleep(test_secs * 1000 * 1000);
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.test_cpl =
> > > worker_params[i]->worker_info.total_cpl -
> > > +						worker_params[i]-
> > > >worker_info.test_cpl;
> > > +
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.stop_flag = true;
> > > +
> > > +	rte_eal_mp_wait_lcore();
> > > +
> > > +	for (i = 0; i < nb_workers; i++) {
> > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > +			worker_params[i]->worker_info.test_cpl,
> > > +			&memory, &avg_cycles, &bandwidth, &mops);
> > > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > > +					worker_params[i]->dma_name,
> > > avg_cycles, buf_size,
> > > +					nr_buf / nb_workers, memory,
> > > bandwidth, mops, is_dma);
> > > +	}
> > > +
> > > +out:
> > > +	/* free env */
> > > +	if (srcs)
> > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > +	if (dsts)
> > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > +
> > > +	if (src_pool)
> > > +		rte_mempool_free(src_pool);
> > > +	if (dst_pool)
> > > +		rte_mempool_free(dst_pool);
> > > +
> > > +	if (is_dma) {
> > > +		for (i = 0; i < nb_workers; i++) {
> > > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > > +			rte_dma_stop(ldm->dma_ids[i]);
> > > +		}
> > > +	}
> > > +}
> > > diff --git a/app/test-dma-perf/config.ini
> > > b/app/test-dma-perf/config.ini new file mode 100644 index
> > > 0000000000..2fd9c3c387
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/config.ini
> > > @@ -0,0 +1,59 @@
> > > +
> > > +; This is an example configuration file for dma-perf, which details
> > > +the meanings of each parameter ; and instructions on how to use dma-
> perf.
> > > +
> > > +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
> > > +
> > > +; Parameters:
> > > +; "mem_size" denotes the size of the memory footprint.
> > > +; "buf_size" denotes the memory size of a single operation.
> > > +; "dma_ring_size" denotes the dma ring buffer size. It should be
> > > +greater
> > > than 64 normally.
> > > +; "kick_batch" denotes the dma operation batch size, and should be
> > > +greater
> > > than 1 normally.
> > > +
> > > +; The format for variables is variable=first,last,increment,ADD|MUL.
> > > +
> > > +; src_numa_node is used to control the numa node where the source
> > > memory is allocated.
> > > +; dst_numa_node is used to control the numa node where the
> > > +destination
> > > memory is allocated.
> > > +
> > > +; cache_flush is used to determine whether or not the cache should
> > > +be flushed, with 1 indicating to ; flush and 0 indicating to not flush.
> > > +
> > > +; test_seconds controls the test time of the whole case.
> > > +
> > > +; To use DMA for a test, please specify the "lcore_dma" parameter.
> > > +; If you have already set the "-l" and "-a" parameters using EAL, ;
> > > +make sure that the value of "lcore_dma" falls within their range of
> > > +the
> > > values.
> > > +
> > > +; To use CPU for a test, please specify the "lcore" parameter.
> > > +; If you have already set the "-l" and "-a" parameters using EAL, ;
> > > +make sure that the value of "lcore" falls within their range of values.
> > > +
> > > +; To specify a configuration file, use the "--config" flag followed
> > > +by the path
> > > to the file.
> > > +
> > > +; To specify a result file, use the "--result" flag followed by the
> > > +path to the
> > > file.
> > > +; If you do not specify a result file, one will be generated with
> > > +the same name as the configuration ; file, with the addition of
> > > +"_result.csv" at
> > > the end.
> > > +
> > > +[case1]
> > > +type=DMA_MEM_COPY
> > > +mem_size=10
> > > +buf_size=64,8192,2,MUL
> > > +dma_ring_size=1024
> > > +kick_batch=32
> > > +src_numa_node=0
> > > +dst_numa_node=0
> > > +cache_flush=0
> > > +test_seconds=2
> > > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> >
> > [Anoob] Isn't it better if we allow user to specify DMA dev ID rather
> > than the PCI DBDF?
> >
> > In the long run, I would expect config file to provide {core,
> > dma_dev_id, queue_id}
> >
> > Another thought: why expose this at all? If we can restrict this
> > perf application so that each thread uses only one vchan, then the
> > application can easily create this mapping at run time, unless you
> > want one thread to use 2 different vchans, which may not be desirable
> > since this is a standalone perf app.
> 
> [Cheng] Thank you for the feedback.
> Here are my thoughts:
> Firstly, the user may not know which device the DMA dev ID corresponds to,
> or which NUMA node it is on. In my example, I used the CBDMA
> environment, so I did not specify the work queue ID. When using DSA, the
> configuration would be something like lcore10@0000:00:04.2-q0 which
> contains core, dma and work queue id. The reason for exposing these
> options is that we want the user to fully understand which cores and devices
> are being used so that they know exactly where the performance data is
> coming from. For example, performance when cores and DMA devices are
> not on the same NUMA node, etc. This allows the testing scenario to be
> precise and flexible. If the application handles the mapping itself, the user
> loses control over the mapping and may not get the performance data they
> want. We believe control should be given to the user rather than the
> application.

[Anoob] I understand your view points. Thanks for the explanation.

> 
> >
> > > +eal_args=--in-memory --file-prefix=test
> > > +
> > > +[case2]
> > > +type=CPU_MEM_COPY
> > > +mem_size=10
> > > +buf_size=64,8192,2,MUL
> > > +src_numa_node=0
> > > +dst_numa_node=1
> > > +cache_flush=0
> > > +test_seconds=2
> > > +lcore = 3, 4
> > > +eal_args=--in-memory --no-pci
> > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c new
> > > file mode 100644 index 0000000000..d65655b87b
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/main.c
> > > @@ -0,0 +1,569 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright(c) 2023 Intel Corporation  */
> > > +
> > > +#include <stdio.h>
> > > +#include <stdlib.h>
> > > +#include <getopt.h>
> > > +#include <signal.h>
> > > +#include <stdbool.h>
> > > +#include <unistd.h>
> > > +#include <sys/wait.h>
> > > +#include <inttypes.h>
> > > +#include <libgen.h>
> > > +
> > > +#include <rte_eal.h>
> > > +#include <rte_cfgfile.h>
> > > +#include <rte_string_fns.h>
> > > +#include <rte_lcore.h>
> > > +
> > > +#include "main.h"
> > > +
> > > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > > +
> > > +#define MAX_EAL_PARAM_NB 100
> > > +#define MAX_EAL_PARAM_LEN 1024
> > > +
> > > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > > +
> > > +#define CMDLINE_CONFIG_ARG "--config"
> > > +#define CMDLINE_RESULT_ARG "--result"
> > > +
> > > +#define MAX_PARAMS_PER_ENTRY 4
> > > +
> > > +#define MAX_LONG_OPT_SZ 64
> > > +
> > > +enum {
> > > +	TEST_TYPE_NONE = 0,
> > > +	TEST_TYPE_DMA_MEM_COPY,
> > > +	TEST_TYPE_CPU_MEM_COPY
> > > +};
> > > +
> > > +#define MAX_TEST_CASES 16
> > > +static struct test_configure test_cases[MAX_TEST_CASES];
> > > +
> > > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > +
> > > +static FILE *fd;
> > > +
> > > +static void
> > > +output_csv(bool need_blankline)
> > > +{
> > > +	uint32_t i;
> > > +
> > > +	if (need_blankline) {
> > > +		fprintf(fd, ",,,,,,,,\n");
> > > +		fprintf(fd, ",,,,,,,,\n");
> > > +	}
> > > +
> > > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > > +		if (output_str[i][0]) {
> > > +			fprintf(fd, "%s", output_str[i]);
> > > +			output_str[i][0] = '\0';
> > > +		}
> > > +	}
> > > +
> > > +	fflush(fd);
> > > +}
> > > +
> > > +static void
> > > +output_env_info(void)
> > > +{
> > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > > environment:\n");
> > > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%"
> > > +			PRIu64 "\n", rte_get_timer_hz());
> > > +
> > > +	output_csv(true);
> > > +}
> > > +
> > > +static void
> > > +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > > +			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
> > > +
> > > +	output_csv(true);
> > > +}
> > > +
> > > +static void
> > > +run_test_case(struct test_configure *case_cfg) {
> > > +	switch (case_cfg->test_type) {
> > > +	case TEST_TYPE_DMA_MEM_COPY:
> > > +		mem_copy_benchmark(case_cfg, true);
> > > +		break;
> > > +	case TEST_TYPE_CPU_MEM_COPY:
> > > +		mem_copy_benchmark(case_cfg, false);
> > > +		break;
> > > +	default:
> > > +		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> > > +		break;
> > > +	}
> > > +}
> > > +
> > > +static void
> > > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > > +	uint32_t i;
> > > +	uint32_t nb_lcores = rte_lcore_count();
> > > +	struct test_configure_entry *mem_size = &case_cfg->mem_size;
> > > +	struct test_configure_entry *buf_size = &case_cfg->buf_size;
> > > +	struct test_configure_entry *ring_size = &case_cfg->ring_size;
> > > +	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
> > > +	struct test_configure_entry dummy = { 0 };
> > > +	struct test_configure_entry *var_entry = &dummy;
> > > +
> > > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > > +
> > > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > > +		printf("Case %u: Not enough lcores.\n", case_id);
> > > +		return;
> > > +	}
> > > +
> > > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > > +
> > > +	if (mem_size->incr != 0)
> > > +		var_entry = mem_size;
> > > +
> > > +	if (buf_size->incr != 0)
> > > +		var_entry = buf_size;
> > > +
> > > +	if (ring_size->incr != 0)
> > > +		var_entry = ring_size;
> > > +
> > > +	if (kick_batch->incr != 0)
> > > +		var_entry = kick_batch;
> > > +
> > > +	case_cfg->scenario_id = 0;
> > > +
> > > +	output_header(case_id, case_cfg);
> > > +
> > > +	for (var_entry->cur = var_entry->first; var_entry->cur <=
> > > +var_entry-
> > > >last;) {
> > > +		case_cfg->scenario_id++;
> > > +		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
> > > +
> > > +		run_test_case(case_cfg);
> > > +		output_csv(false);
> > > +
> > > +		if (var_entry->op == OP_ADD)
> > > +			var_entry->cur += var_entry->incr;
> > > +		else if (var_entry->op == OP_MUL)
> > > +			var_entry->cur *= var_entry->incr;
> > > +		else
> > > +			break;
> > > +	}
> > > +}
> > > +
> > > +static int
> > > +parse_lcore(struct test_configure *test_case, const char *value) {
> > > +	size_t len = strlen(value);
> > > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > > +	strcpy(input, value);
> > > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > > >lcore_dma_map);
> > > +
> > > +	if (test_case == NULL || value == NULL)
> > > +		return -1;
> > > +
> > > +	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
> > > +
> > > +	char *token = strtok(input, ", ");
> > > +	while (token != NULL) {
> > > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > > +			free(input);
> > > +			return -1;
> > > +		}
> > > +
> > > +		uint16_t lcore_id = atoi(token);
> > > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;
> > > +
> > > +		token = strtok(NULL, ", ");
> > > +	}
> > > +
> > > +	free(input);
> > > +	return 0;
> > > +}
> > > +
> > > +static int
> > > +parse_lcore_dma(struct test_configure *test_case, const char *value) {
> > > +	struct lcore_dma_map_t *lcore_dma_map;
> > > +	char *input = strndup(value, strlen(value) + 1);
> > > +	char *addrs = input;
> > > +	char *ptrs[2];
> > > +	char *start, *end, *substr;
> > > +	uint16_t lcore_id;
> > > +	int ret = 0;
> > > +
> > > +	while (*addrs == '\0')
> > > +		addrs++;
> > > +	if (*addrs == '\0') {
> > > +		fprintf(stderr, "No input DMA addresses\n");
> > > +		ret = -1;
> > > +		goto out;
> > > +	}
> > > +
> > > +	substr = strtok(addrs, ",");
> > > +	if (substr == NULL) {
> > > +		fprintf(stderr, "No input DMA address\n");
> > > +		ret = -1;
> > > +		goto out;
> > > +	}
> > > +
> > > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > > lcore_dma_map_t));
> > > +
> > > +	do {
> > > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > > +
> > > +		start = strstr(ptrs[0], "lcore");
> > > +		if (start == NULL) {
> > > +			fprintf(stderr, "Illegal lcore\n");
> > > +			ret = -1;
> > > +			break;
> > > +		}
> > > +
> > > +		start += 5;
> > > +		lcore_id = strtol(start, &end, 0);
> > > +		if (end == start) {
> > > +			fprintf(stderr, "No input lcore ID or ID %d is
> > > wrong\n", lcore_id);
> > > +			ret = -1;
> > > +			break;
> > > +		}
> > > +
> > > +		lcore_dma_map = &test_case->lcore_dma_map;
> > > +		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
> > > +		strcpy(lcore_dma_map->dma_names[lcore_dma_map-
> > > >cnt], ptrs[1]);
> > > +		lcore_dma_map->cnt++;
> > > +		substr = strtok(NULL, ",");
> > > +	} while (substr != NULL);
> > > +
> > > +out:
> > > +	free(input);
> > > +	return ret;
> > > +}
> > > +
> > > +static int
> > > +parse_entry(const char *value, struct test_configure_entry *entry) {
> > > +	char input[255] = {0};
> > > +	char *args[MAX_PARAMS_PER_ENTRY];
> > > +	int args_nr = -1;
> > > +
> > > +	if (value == NULL || entry == NULL)
> > > +		goto out;
> > > +
> > > +	strncpy(input, value, 254);
> > > +	if (*input == '\0')
> > > +		goto out;
> > > +
> > > +	args_nr = rte_strsplit(input, strlen(input), args,
> > > MAX_PARAMS_PER_ENTRY, ',');
> > > +	if (args_nr != 1 && args_nr != 4)
> > > +		goto out;
> > > +
> > > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > > +
> > > +	if (args_nr == 4) {
> > > +		entry->last = (uint32_t)atoi(args[1]);
> > > +		entry->incr = (uint32_t)atoi(args[2]);
> > > +		if (!strcmp(args[3], "MUL"))
> > > +			entry->op = OP_MUL;
> > > +		else if (!strcmp(args[3], "ADD"))
> > > +			entry->op = OP_ADD;
> > > +		else {
> > > +			printf("Invalid op %s.\n", args[3]);
> > > +			args_nr = -1;
> > > +		}
> > > +	} else {
> > > +		entry->op = OP_NONE;
> > > +		entry->last = 0;
> > > +		entry->incr = 0;
> > > +	}
> > > +out:
> > > +	return args_nr;
> > > +}
> > > +
> > > +static uint16_t
> > > +load_configs(const char *path)
> > > +{
> > > +	struct rte_cfgfile *cfgfile;
> > > +	int nb_sections, i;
> > > +	struct test_configure *test_case;
> > > +	char section_name[CFG_NAME_LEN];
> > > +	const char *case_type;
> > > +	const char *lcore_dma;
> > > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > > *kick_batch_str;
> > > +	int args_nr, nb_vp;
> > > +	bool is_dma;
> > > +
> > > +	printf("config file parsing...\n");
> > > +	cfgfile = rte_cfgfile_load(path, 0);
> > > +	if (!cfgfile) {
> > > +		printf("Open configure file error.\n");
> > > +		exit(1);
> > > +	}
> > > +
> > > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > > +	if (nb_sections > MAX_TEST_CASES) {
> > > +		printf("Error: The maximum number of cases is %d.\n",
> > > MAX_TEST_CASES);
> > > +		exit(1);
> > > +	}
> > > +
> > > +	for (i = 0; i < nb_sections; i++) {
> > > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
> > > +		test_case = &test_cases[i];
> > > +		case_type = rte_cfgfile_get_entry(cfgfile, section_name,
> > > "type");
> > > +		if (!case_type) {
> > > +			printf("Error: No case type in case %d, the test will be
> > > finished here.\n",
> > > +				i + 1);
> > > +			test_case->is_valid = false;
> > > +			continue;
> > > +		}
> > > +
> > > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > > +			test_case->test_type =
> > > TEST_TYPE_DMA_MEM_COPY;
> > > +			test_case->test_type_str = DMA_MEM_COPY;
> > > +			is_dma = true;
> > > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > > +			test_case->test_type =
> > > TEST_TYPE_CPU_MEM_COPY;
> > > +			test_case->test_type_str = CPU_MEM_COPY;
> > > +			is_dma = false;
> > > +		} else {
> > > +			printf("Error: Cannot find case type %s in case%d.\n",
> > > case_type, i + 1);
> > > +			test_case->is_valid = false;
> > > +			continue;
> > > +		}
> > > +
> > > +		nb_vp = 0;
> > > +
> > > +		test_case->src_numa_node =
> > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > +
> > > 	section_name, "src_numa_node"));
> > > +		test_case->dst_numa_node =
> > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > +
> > > 	section_name, "dst_numa_node"));
> > > +
> > > +		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> > > "mem_size");
> > > +		args_nr = parse_entry(mem_size_str, &test_case-
> > > >mem_size);
> > > +		if (args_nr < 0) {
> > > +			printf("parse error in case %d.\n", i + 1);
> > > +			test_case->is_valid = false;
> > > +			continue;
> > > +		} else if (args_nr > 1)
> > > +			nb_vp++;
> > > +
> > > +		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> > > "buf_size");
> > > +		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
> > > +		if (args_nr < 0) {
> > > +			printf("parse error in case %d.\n", i + 1);
> > > +			test_case->is_valid = false;
> > > +			continue;
> > > +		} else if (args_nr > 1)
> > > +			nb_vp++;
> > > +
> > > +		if (is_dma) {
> > > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > > section_name,
> > > +
> > > 	"dma_ring_size");
> > > +			args_nr = parse_entry(ring_size_str, &test_case-
> > > >ring_size);
> > > +			if (args_nr < 0) {
> > > +				printf("parse error in case %d.\n", i + 1);
> > > +				test_case->is_valid = false;
> > > +				continue;
> > > +			} else if (args_nr > 1)
> > > +				nb_vp++;
> > > +
> > > +			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
> > > section_name, "kick_batch");
> > > +			args_nr = parse_entry(kick_batch_str, &test_case-
> > > >kick_batch);
> > > +			if (args_nr < 0) {
> > > +				printf("parse error in case %d.\n", i + 1);
> > > +				test_case->is_valid = false;
> > > +				continue;
> > > +			} else if (args_nr > 1)
> > > +				nb_vp++;
> > > +
> > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > section_name, "lcore_dma");
> > > +			int lcore_ret = parse_lcore_dma(test_case,
> > > lcore_dma);
> > > +			if (lcore_ret < 0) {
> > > +				printf("parse lcore dma error in case %d.\n", i
> > 1);
> > > +				test_case->is_valid = false;
> > > +				continue;
> > > +			}
> > > +		} else {
> > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > section_name, "lcore");
> > > +			int lcore_ret = parse_lcore(test_case, lcore_dma);
> > > +			if (lcore_ret < 0) {
> > > +				printf("parse lcore error in case %d.\n", i + 1);
> > > +				test_case->is_valid = false;
> > > +				continue;
> > > +			}
> > > +		}
> > > +
> > > +		if (nb_vp > 1) {
> > > +			printf("Error, each section can only have a single
> > > variable parameter.\n");
> > > +			test_case->is_valid = false;
> > > +			continue;
> > > +		}
> > > +
> > > +		test_case->cache_flush =
> > > +			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name,
> > > "cache_flush"));
> > > +		test_case->test_secs =
> > > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > > +					section_name, "test_seconds"));
> > > +
> > > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > > section_name, "eal_args");
> > > +		test_case->is_valid = true;
> > > +	}
> > > +
> > > +	rte_cfgfile_close(cfgfile);
> > > +	printf("config file parsing complete.\n\n");
> > > +	return i;
> > > +}
> > > +
> > > +/* Parse the argument given in the command line of the application
> > > +*/ static int append_eal_args(int argc, char **argv, const char
> > > +*eal_args, char **new_argv) {
> > > +	int i;
> > > +	char *tokens[MAX_EAL_PARAM_NB];
> > > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > > +	int token_nb, new_argc = 0;
> > > +
> > > +	for (i = 0; i < argc; i++) {
> > > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > > +				(strcmp(argv[i], CMDLINE_RESULT_ARG) ==
> > > 0)) {
> > > +			i++;
> > > +			continue;
> > > +		}
> > > +		strlcpy(new_argv[new_argc], argv[i],
> > > sizeof(new_argv[new_argc]));
> > > +		new_argc++;
> > > +	}
> > > +
> > > +	if (eal_args) {
> > > +		strlcpy(args, eal_args, sizeof(args));
> > > +		token_nb = rte_strsplit(args, strlen(args),
> > > +					tokens, MAX_EAL_PARAM_NB, ' ');
> > > +		for (i = 0; i < token_nb; i++)
> > > +			strcpy(new_argv[new_argc++], tokens[i]);
> > > +	}
> > > +
> > > +	return new_argc;
> > > +}
> > > +
> > > +int
> > > +main(int argc, char *argv[])
> > > +{
> > > +	int ret;
> > > +	uint16_t case_nb;
> > > +	uint32_t i, nb_lcores;
> > > +	pid_t cpid, wpid;
> > > +	int wstatus;
> > > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > > +	char *pargs[MAX_EAL_PARAM_NB];
> > > +	char *cfg_path_ptr = NULL;
> > > +	char *rst_path_ptr = NULL;
> > > +	char rst_path[PATH_MAX];
> > > +	int new_argc;
> > > +	bool is_first_case = true;
> > > +
> > > +	memset(args, 0, sizeof(args));
> > > +
> > > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > > +		pargs[i] = args[i];
> > > +
> > > +	for (i = 0; i < (uint32_t)argc; i++) {
> > > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > > MAX_LONG_OPT_SZ) == 0)
> > > +			cfg_path_ptr = argv[i + 1];
> > > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > > MAX_LONG_OPT_SZ) == 0)
> > > +			rst_path_ptr = argv[i + 1];
> > > +	}
> > > +	if (cfg_path_ptr == NULL) {
> > > +		printf("Config file not assigned.\n");
> > > +		return -1;
> > > +	}
> > > +	if (rst_path_ptr == NULL) {
> > > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > > +		strcat(strtok(basename(rst_path), "."), "_result.csv");
> > > +		rst_path_ptr = rst_path;
> > > +	}
> > > +
> > > +	case_nb = load_configs(cfg_path_ptr);
> > > +	fd = fopen(rst_path_ptr, "w");
> > > +	if (fd == NULL) {
> > > +		printf("Open output CSV file error.\n");
> > > +		return -1;
> > > +	}
> > > +	fclose(fd);
> > > +
> > > +	for (i = 0; i < case_nb; i++) {
> > > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > > +			printf("No test type in test case %d.\n\n", i + 1);
> > > +			continue;
> > > +		}
> > > +		if (!test_cases[i].is_valid) {
> > > +			printf("Invalid test case %d.\n\n", i + 1);
> > > +			continue;
> > > +		}
> > > +
> > > +		cpid = fork();
> >
> > [Anoob] Do we really need fork()? Can't we use code like,
> >
> > 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> > 			ret |= rte_eal_wait_lcore(lcore_id);
> > 		}
> >
> > to wait for all threads to exit?
> 
> [Cheng] Good question. Fork() is used here to establish a new process for
> the new test case. In order for each test case to have a new EAL
> environment (for the flexibility), the EAL must be reinitialized for each case.
> However, the EAL parameters can only be initialized once per process.
> Therefore, we use a new process to run each new test case. Moreover, each
> test case runs sequentially and does not affect the others, ensuring the
> accuracy of the performance data. Your code would wait for all threads to
> exit in the same process. However, it would not provide a "clean"
> environment for each test case like fork() does. Fork() allows us to have a
> fully reinitialized environment, with no impact or side effects from previous
> test cases. This results in clean, precise performance data for each case.
> 
> Please let me know your thoughts on this. And please let me know if you
> have any other questions or require any clarification.

[Anoob] This was just a generic observation. I do not have a strong opinion either way.

> 
> Thanks,
> Cheng
> 
> >
> > > +		if (cpid < 0) {
> > > +			printf("Fork case %d failed.\n", i + 1);
> > > +			exit(EXIT_FAILURE);
> > > +		} else if (cpid == 0) {
> > > +			printf("\nRunning case %u\n\n", i + 1);
> > > +
> > > +			new_argc = append_eal_args(argc, argv,
> > > test_cases[i].eal_args, pargs);
> > > +			ret = rte_eal_init(new_argc, pargs);
> > > +			if (ret < 0)
> > > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > > arguments\n");
> > > +
> > > +			/* Check lcores. */
> > > +			nb_lcores = rte_lcore_count();
> > > +			if (nb_lcores < 2)
> > > +				rte_exit(EXIT_FAILURE,
> > > +					"There should be at least 2 worker
> > > lcores.\n");
> > > +
> > > +			fd = fopen(rst_path_ptr, "a");
> > > +			if (!fd) {
> > > +				printf("Open output CSV file error.\n");
> > > +				return 0;
> > > +			}
> > > +
> > > +			if (is_first_case) {
> > > +				output_env_info();
> > > +				is_first_case = false;
> > > +			}
> > > +			run_test(i + 1, &test_cases[i]);
> > > +
> > > +			/* clean up the EAL */
> > > +			rte_eal_cleanup();
> > > +
> > > +			fclose(fd);
> > > +
> > > +			printf("\nCase %u completed.\n\n", i + 1);
> > > +
> > > +			exit(EXIT_SUCCESS);
> > > +		} else {
> > > +			wpid = waitpid(cpid, &wstatus, 0);
> > > +			if (wpid == -1) {
> > > +				printf("waitpid error.\n");
> > > +				exit(EXIT_FAILURE);
> > > +			}
> > > +
> > > +			if (WIFEXITED(wstatus))
> > > +				printf("Case process exited. status %d\n\n",
> > > +					WEXITSTATUS(wstatus));
> > > +			else if (WIFSIGNALED(wstatus))
> > > +				printf("Case process killed by signal %d\n\n",
> > > +					WTERMSIG(wstatus));
> > > +			else if (WIFSTOPPED(wstatus))
> > > +				printf("Case process stopped by signal
> > > %d\n\n",
> > > +					WSTOPSIG(wstatus));
> > > +			else if (WIFCONTINUED(wstatus))
> > > +				printf("Case process continued.\n\n");
> > > +			else
> > > +				printf("Case process unknown
> > > terminated.\n\n");
> > > +		}
> > > +	}
> > > +
> > > +	printf("Bye...\n");
> > > +	return 0;
> > > +}
> > > +
> > > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new
> > > file mode 100644 index 0000000000..215ac42673
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/main.h
> > > @@ -0,0 +1,69 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright(c) 2023 Intel Corporation  */
> > > +
> > > +#ifndef _MAIN_H_
> > > +#define _MAIN_H_
> > > +
> > > +
> > > +#include <rte_common.h>
> > > +#include <rte_cycles.h>
> > > +#include <rte_dev.h>
> > > +#include <rte_dmadev.h>
> > > +
> > > +#ifndef __maybe_unused
> > > +#define __maybe_unused	__rte_unused
> > > +#endif
> > > +
> > > +#define MAX_WORKER_NB 128
> > > +#define MAX_OUTPUT_STR_LEN 512
> > > +
> > > +#define MAX_DMA_NB 128
> > > +#define MAX_LCORE_NB 256
> > > +
> > > +extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > +
> > > +typedef enum {
> > > +	OP_NONE = 0,
> > > +	OP_ADD,
> > > +	OP_MUL
> > > +} alg_op_type;
> > > +
> > > +struct test_configure_entry {
> > > +	uint32_t first;
> > > +	uint32_t last;
> > > +	uint32_t incr;
> > > +	alg_op_type op;
> > > +	uint32_t cur;
> > > +};
> > > +
> > > +struct lcore_dma_map_t {
> > > +	uint32_t lcores[MAX_WORKER_NB];
> > > +	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > > +	int16_t dma_ids[MAX_WORKER_NB];
> > > +	uint16_t cnt;
> > > +};
> > > +
> > > +struct test_configure {
> > > +	bool is_valid;
> > > +	uint8_t test_type;
> > > +	const char *test_type_str;
> > > +	uint16_t src_numa_node;
> > > +	uint16_t dst_numa_node;
> > > +	uint16_t opcode;
> > > +	bool is_dma;
> > > +	struct lcore_dma_map_t lcore_dma_map;
> > > +	struct test_configure_entry mem_size;
> > > +	struct test_configure_entry buf_size;
> > > +	struct test_configure_entry ring_size;
> > > +	struct test_configure_entry kick_batch;
> > > +	uint32_t cache_flush;
> > > +	uint32_t nr_buf;
> > > +	uint16_t test_secs;
> > > +	const char *eal_args;
> > > +	uint8_t scenario_id;
> > > +};
> > > +
> > > +void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> > > +
> > > +#endif /* _MAIN_H_ */
> > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/meson.build
> > > @@ -0,0 +1,17 @@
> > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > +Intel Corporation
> > > +
> > > +# meson file, for building this app as part of a main DPDK build.
> > > +
> > > +if is_windows
> > > +    build = false
> > > +    reason = 'not supported on Windows'
> > > +    subdir_done()
> > > +endif
> > > +
> > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > +
> > > +sources = files(
> > > +        'main.c',
> > > +        'benchmark.c',
> > > +)
> > > --
> > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-15  8:44       ` Anoob Joseph
@ 2023-06-15 14:05         ` Jiang, Cheng1
  2023-06-15 15:47           ` Anoob Joseph
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-15 14:05 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Thursday, June 15, 2023 4:45 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Please see inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Thursday, June 15, 2023 1:31 PM
> > To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> > Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> > Prakash Shukla <amitprakashs@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi,
> >
> > Thanks for your comments, the replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Thursday, June 15, 2023 1:22 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> Amit
> > > Prakash Shukla <amitprakashs@marvell.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> > YuanX
> > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi,
> > >
> > > Thanks for working on the comments. Few more top level comment
> inline.
> > >
> > > Thanks,
> > > Anoob
> > >
> > > > -----Original Message-----
> > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Sent: Tuesday, June 13, 2023 10:02 AM
> > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash
> > Shukla
> > > > <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>
> > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > xingguang.he@intel.com;
> > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > External Email
> > > >
> > > > ------------------------------------------------------------------
> > > > --
> > > > -- There are many high-performance DMA devices supported in DPDK
> > > > now, and these DMA devices can also be integrated into other
> > > > modules of DPDK as accelerators, such as Vhost. Before integrating
> > > > DMA into applications, developers need to know the performance of
> > > > these DMA devices in various scenarios and the performance of CPUs
> > > > in the same scenario, such as different buffer lengths. Only in
> > > > this way can we know the target performance of the application
> > > > accelerated by using them. This patch introduces a
> > > > high-performance testing tool, which supports comparing the
> > > > performance of CPU and DMA in different scenarios automatically
> > > > with a pre- set config file. Memory Copy performance test are
> > > supported for now.
> > > >
> > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > ---
> > > > v6:
> > > >   improved code based on Anoob's comments;
> > > >   fixed some code structure issues;
> > > > v5:
> > > >   fixed some LONG_LINE warnings;
> > > > v4:
> > > >   fixed inaccuracy of the memory footprint display;
> > > > v3:
> > > >   fixed some typos;
> > > > v2:
> > > >   added lcore/dmadev designation;
> > > >   added error case process;
> > > >   removed worker_threads parameter from config.ini;
> > > >   improved the logs;
> > > >   improved config file;
> > > >
> > > >  app/meson.build               |   1 +
> > > >  app/test-dma-perf/benchmark.c | 477
> > ++++++++++++++++++++++++++++
> > > > app/test-dma-perf/config.ini  |  59 ++++
> > > >  app/test-dma-perf/main.c      | 569
> > > > ++++++++++++++++++++++++++++++++++
> > > >  app/test-dma-perf/main.h      |  69 +++++
> > > >  app/test-dma-perf/meson.build |  17 +
> > > >  6 files changed, 1192 insertions(+)  create mode 100644
> > > > app/test-dma-perf/benchmark.c  create mode 100644
> > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > > mode
> > > > 100644 app/test-dma-perf/meson.build
> > > >
> > > > diff --git a/app/meson.build b/app/meson.build index
> > > > 74d2420f67..4fc1a83eba 100644
> > > > --- a/app/meson.build
> > > > +++ b/app/meson.build
> > > > @@ -19,6 +19,7 @@ apps = [
> > > >          'test-cmdline',
> > > >          'test-compress-perf',
> > > >          'test-crypto-perf',
> > > > +        'test-dma-perf',
> > > >          'test-eventdev',
> > > >          'test-fib',
> > > >          'test-flow-perf',
> > > > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > > > perf/benchmark.c new file mode 100644 index 0000000000..bc1ca82297
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/benchmark.c
> > > > @@ -0,0 +1,477 @@
> > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > +
> > > > +#include <inttypes.h>
> > > > +#include <stdio.h>
> > > > +#include <stdlib.h>
> > > > +#include <unistd.h>
> > > > +
> > > > +#include <rte_time.h>
> > > > +#include <rte_mbuf.h>
> > > > +#include <rte_dmadev.h>
> > > > +#include <rte_malloc.h>
> > > > +#include <rte_lcore.h>
> > > > +
> > > > +#include "main.h"
> > > > +
> > > > +#define MAX_DMA_CPL_NB 255
> > > > +
> > > > +#define TEST_WAIT_U_SECOND 10000
> > > > +
> > > > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%"
> > PRIu64
> > > > ",%.3lf,%.3lf\n"
> > > > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%"
> > PRIu64
> > > > ",%.3lf,%.3lf\n"
> > > > +
> > > > +struct worker_info {
> > > > +	bool ready_flag;
> > > > +	bool start_flag;
> > > > +	bool stop_flag;
> > > > +	uint32_t total_cpl;
> > > > +	uint32_t test_cpl;
> > > > +};
> > > > +
> > > > +struct lcore_params {
> > > > +	uint8_t scenario_id;
> > > > +	unsigned int lcore_id;
> > > > +	char *dma_name;
> > > > +	uint16_t worker_id;
> > > > +	uint16_t dev_id;
> > > > +	uint32_t nr_buf;
> > > > +	uint16_t kick_batch;
> > > > +	uint32_t buf_size;
> > > > +	uint16_t test_secs;
> > > > +	struct rte_mbuf **srcs;
> > > > +	struct rte_mbuf **dsts;
> > > > +	struct worker_info worker_info;
> > > > +};
> > > > +
> > > > +static struct rte_mempool *src_pool; static struct rte_mempool
> > > > +*dst_pool;
> > > > +
> > > > +static volatile struct lcore_params
> > *worker_params[MAX_WORKER_NB];
> > > > +
> > > > +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> > > > +
> > > > +static inline int
> > > > +__rte_format_printf(3, 4)
> > > > +print_err(const char *func, int lineno, const char *format, ...) {
> > > > +	va_list ap;
> > > > +	int ret;
> > > > +
> > > > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > > > +	va_start(ap, format);
> > > > +	ret += vfprintf(stderr, format, ap);
> > > > +	va_end(ap);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static inline void
> > > > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t
> > > > +nb_workers,
> > > > uint16_t test_secs,
> > > > +				uint32_t total_cnt, float *memory, uint32_t
> > > > *ave_cycle,
> > > > +				float *bandwidth, float *mops) {
> > > > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024
> > > > +*
> > > > 1024);
> > > > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > > > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > > > (float)*ave_cycle)) / 1000000000;
> > > > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > > > +
> > > > +static void
> > > > +output_result(uint8_t scenario_id, uint32_t lcore_id, char
> > > > +*dma_name,
> > > > uint64_t ave_cycle,
> > > > +			uint32_t buf_size, uint32_t nr_buf, float memory,
> > > > +			float bandwidth, float mops, bool is_dma) {
> > > > +	if (is_dma)
> > > > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > > > +	else
> > > > +		printf("lcore %u\n", lcore_id);
> > > > +
> > > > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf:
> > > > +%u,
> > > > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > > > +			ave_cycle, buf_size, nr_buf, memory,
> > > > rte_get_timer_hz());
> > > > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth,
> > > > +mops);
> > > > +
> > > > +	if (is_dma)
> > > > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > > > CSV_LINE_DMA_FMT,
> > > > +			scenario_id, lcore_id, dma_name, buf_size,
> > > > +			nr_buf, memory, ave_cycle, bandwidth, mops);
> > > > +	else
> > > > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > > > CSV_LINE_CPU_FMT,
> > > > +			scenario_id, lcore_id, buf_size,
> > > > +			nr_buf, memory, ave_cycle, bandwidth, mops); }
> > > > +
> > > > +static inline void
> > > > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > > > +		__maybe_unused uint32_t buf_size,
> > > > +		__maybe_unused uint32_t nr_buf) { #ifdef
> RTE_ARCH_X86_64
> > > > +	char *data;
> > > > +	struct rte_mbuf **srcs = array;
> > > > +	uint32_t i, offset;
> > > > +
> > > > +	for (i = 0; i < nr_buf; i++) {
> > > > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > > > +		for (offset = 0; offset < buf_size; offset += 64)
> > > > +			__builtin_ia32_clflush(data + offset);
> > > > +	}
> > > > +#endif
> > > > +}
> > > > +
> > > > +/* Configuration of device. */
> > > > +static void
> > > > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > > > +	uint16_t vchan = 0;
> > > > +	struct rte_dma_info info;
> > > > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > > +	struct rte_dma_vchan_conf qconf = {
> > > > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > > +		.nb_desc = ring_size
> > > > +	};
> > > > +
> > > > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > > > +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> > > > +
> > > > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > > > +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> > > > +
> > > > +	rte_dma_info_get(dev_id, &info);
> > > > +	if (info.nb_vchans != 1)
> > > > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > > > reported on device id. %u\n",
> > > > +				dev_id);
> > > > +
> > > > +	if (rte_dma_start(dev_id) != 0)
> > > > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > > > +
> > > > +static int
> > > > +config_dmadevs(struct test_configure *cfg) {
> > > > +	uint32_t ring_size = cfg->ring_size.cur;
> > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > +	uint32_t nb_workers = ldm->cnt;
> > > > +	uint32_t i;
> > > > +	int dev_id;
> > > > +	uint16_t nb_dmadevs = 0;
> > > > +	char *dma_name;
> > > > +
> > > > +	for (i = 0; i < ldm->cnt; i++) {
> > > > +		dma_name = ldm->dma_names[i];
> > > > +		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> > > > +		if (dev_id == -1) {
> > > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > > dma_name);
> > > > +			goto end;
> > > > +		}
> > > > +
> > > > +		ldm->dma_ids[i] = dev_id;
> > > > +		configure_dmadev_queue(dev_id, ring_size);
> > > > +		++nb_dmadevs;
> > > > +	}
> > > > +
> > > > +end:
> > > > +	if (nb_dmadevs < nb_workers) {
> > > > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> > > > nb_dmadevs, nb_workers);
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +#define POLL_MAX 1000
> > > > +
> > > > +
> > >
> > > [Anoob] Extra blank line. You can consider removing.
> >
> > [Cheng] sure, sorry for the miss.
> >
> > >
> > > > +static inline void
> > > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > > > +			volatile struct worker_info *worker_info) {
> > > > +	int ret;
> > > > +	uint16_t nr_cpl;
> > > > +
> > > > +	ret = rte_dma_submit(dev_id, 0);
> > > > +	if (ret < 0) {
> > > > +		rte_dma_stop(dev_id);
> > > > +		rte_dma_close(dev_id);
> > > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > > +	}
> > > > +
> > > > +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> > > > NULL);
> > > > +	*async_cnt -= nr_cpl;
> > > > +	worker_info->total_cpl += nr_cpl; }
> > > > +
> > > > +static inline int
> > > > +do_dma_mem_copy(void *p)
> > > > +{
> > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > > +	const uint16_t dev_id = para->dev_id;
> > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > +	const uint16_t kick_batch = para->kick_batch;
> > > > +	const uint32_t buf_size = para->buf_size;
> > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > +	uint16_t nr_cpl;
> > > > +	uint64_t async_cnt = 0;
> > > > +	uint32_t i;
> > > > +	uint32_t poll_cnt = 0;
> > > > +	int ret;
> > > > +
> > > > +	worker_info->stop_flag = false;
> > > > +	worker_info->ready_flag = true;
> > > > +
> > > > +	while (!worker_info->start_flag)
> > > > +		;
> > > > +
> > > > +	while (1) {
> > > > +		for (i = 0; i < nr_buf; i++) {
> > > > +dma_copy:
> > > > +			ret = rte_dma_copy(dev_id, 0,
> > > > rte_pktmbuf_iova(srcs[i]),
> > > > +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> > > > +			if (unlikely(ret < 0)) {
> > > > +				if (ret == -ENOSPC) {
> > > > +					do_dma_submit_and_poll(dev_id,
> > > > &async_cnt, worker_info);
> > > > +					goto dma_copy;
> > > > +				} else {
> > > > +					/* Error exit */
> > > > +					rte_dma_stop(dev_id);
> > > > +					rte_exit(EXIT_FAILURE, "DMA
> > > > enqueue failed\n");
> > > > +				}
> > > > +			}
> > > > +			async_cnt++;
> > > > +
> > > > +			if ((async_cnt % kick_batch) == 0)
> > > > +				do_dma_submit_and_poll(dev_id,
> > > > &async_cnt, worker_info);
> > > > +		}
> > > > +
> > > > +		if (worker_info->stop_flag)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	rte_dma_submit(dev_id, 0);
> > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +		async_cnt -= nr_cpl;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static inline int
> > > > +do_cpu_mem_copy(void *p)
> > > > +{
> > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > +	const uint32_t buf_size = para->buf_size;
> > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > +	uint32_t i;
> > > > +
> > > > +	worker_info->stop_flag = false;
> > > > +	worker_info->ready_flag = true;
> > > > +
> > > > +	while (!worker_info->start_flag)
> > > > +		;
> > > > +
> > > > +	while (1) {
> > > > +		for (i = 0; i < nr_buf; i++) {
> > > > +			/* copy buffer form src to dst */
> > > > +			rte_memcpy((void
> > > > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > > > +				(void
> > > > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > > > +				(size_t)buf_size);
> > > > +			worker_info->total_cpl++;
> > > > +		}
> > > > +		if (worker_info->stop_flag)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int
> > > > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf
> > ***srcs,
> > > > +			struct rte_mbuf ***dsts)
> > > > +{
> > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > +	unsigned int nr_sockets;
> > > > +	uint32_t nr_buf = cfg->nr_buf;
> > > > +
> > > > +	nr_sockets = rte_socket_count();
> > > > +	if (cfg->src_numa_node >= nr_sockets ||
> > > > +		cfg->dst_numa_node >= nr_sockets) {
> > > > +		printf("Error: Source or destination numa exceeds the acture
> > > > numa nodes.\n");
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > > > +			nr_buf, /* n == num elements */
> > > > +			64,  /* cache size */
> > > > +			0,   /* priv size */
> > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > +			cfg->src_numa_node);
> > > > +	if (src_pool == NULL) {
> > > > +		PRINT_ERR("Error with source mempool creation.\n");
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > > > +			nr_buf, /* n == num elements */
> > > > +			64,  /* cache size */
> > >
> > > [Anoob] We do not alloc or free pointers in the datapath, right? So
> > > why bother with cache?
> >
> > [Cheng] Yes, you are right, the cache size is not necessary here, I'll
> > fix it in the next version.
> >
> > >
> > > > +			0,   /* priv size */
> > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > +			cfg->dst_numa_node);
> > > > +	if (dst_pool == NULL) {
> > > > +		PRINT_ERR("Error with destination mempool creation.\n");
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > > > +	if (*srcs == NULL) {
> > > > +		printf("Error: srcs malloc failed.\n");
> > > > +		return -1;
> > > > +	}
> > >
> > > [Anoob] Are we freeing these memory? The ones allocated with
> > rte_malloc.
> >
> > [Cheng] yes, we freed the memory in the end of mem_copy_benchmark()
> > when we finished the test.
> 
> [Anoob] I think we are not freeing this mem. In the place where we free all
> mem, we do free all objects to mempool as well as the mempools. But this
> memory is to hold the pointers, right? Is that getting freed anywhere?
> 
> Also, in the mem clearing paths, do we need to clear the static variables (ie,
> set srcs, src_pool, dsts, dst_pool to NULL) so that there won't be any scope
> for any double free.
> 

[Cheng] My apologies for the earlier misunderstanding. I see your point now, and you are right: the memory used to store the pointers is not being freed. I will fix this issue in the next version. Regarding the static variables you mentioned, I agree that they should be cleared, and I will address this in the upcoming version as well. Thank you very much for the feedback; it is greatly appreciated.

In addition, I think we should also initialize these variables to NULL, so that they are safe and consistent to use. What do you think?

Thanks!

> >
> > >
> > > > +
> > > > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > > > +	if (*dsts == NULL) {
> > > > +		printf("Error: dsts malloc failed.\n");
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
> > > > +		printf("get src mbufs failed.\n");
> > > > +		return -1;
> > > > +	}
> > > > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
> > > > +		printf("get dst mbufs failed.\n");
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +void
> > > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > > +	uint16_t i;
> > > > +	uint32_t offset;
> > > > +	unsigned int lcore_id = 0;
> > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > +1024) /
> > > > (cfg->buf_size.cur * 2);
> > > > +	uint16_t nb_workers = ldm->cnt;
> > > > +	uint16_t test_secs = cfg->test_secs;
> > > > +	float memory;
> > > > +	uint32_t avg_cycles = 0;
> > > > +	float mops;
> > > > +	float bandwidth;
> > > > +
> > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > +		goto out;
> > > > +
> > > > +	if (is_dma)
> > > > +		if (config_dmadevs(cfg) < 0)
> > > > +			goto out;
> > > > +
> > > > +	if (cfg->cache_flush) {
> > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > +		rte_mb();
> > > > +	}
> > > > +
> > > > +	printf("Start testing....\n");
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++) {
> > > > +		lcore_id = ldm->lcores[i];
> > > > +		offset = nr_buf / nb_workers * i;
> > > > +
> > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > lcore_params), 0);
> > > > +		if (!worker_params[i]) {
> > > > +			printf("lcore parameters malloc failure for lcore
> > > > %d\n", lcore_id);
> > > > +			break;
> > > > +		}
> > >
> > > [Anoob] Are we freeing the above memory?
> >
> > [Cheng] sorry, I missed that, I'll add worker_params memory free in
> > the next version, thanks.
> >
> > >
> > > > +		if (is_dma) {
> > > > +			worker_params[i]->dma_name = ldm-
> > > > >dma_names[i];
> > > > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > +		}
> > > > +		worker_params[i]->worker_id = i;
> > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > nb_workers);
> > > > +		worker_params[i]->buf_size = buf_size;
> > > > +		worker_params[i]->test_secs = test_secs;
> > > > +		worker_params[i]->srcs = srcs + offset;
> > > > +		worker_params[i]->dsts = dsts + offset;
> > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > +
> > > > +		if (is_dma)
> > > > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > *)(&i), lcore_id);
> > > > +		else
> > > > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > > > *)(&i), lcore_id);
> > > > +	}
> > > > +
> > > > +	while (1) {
> > > > +		bool ready = true;
> > > > +		for (i = 0; i < nb_workers; i++) {
> > > > +			if (worker_params[i]->worker_info.ready_flag ==
> > > > false) {
> > > > +				ready = 0;
> > > > +				break;
> > > > +			}
> > > > +		}
> > > > +		if (ready)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > +
> > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.test_cpl =
> > > > +worker_params[i]->worker_info.total_cpl;
> > > > +
> > > > +	usleep(test_secs * 1000 * 1000);
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.test_cpl =
> > > > worker_params[i]->worker_info.total_cpl -
> > > > +						worker_params[i]-
> > > > >worker_info.test_cpl;
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > +
> > > > +	rte_eal_mp_wait_lcore();
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++) {
> > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > +			worker_params[i]->worker_info.test_cpl,
> > > > +			&memory, &avg_cycles, &bandwidth, &mops);
> > > > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > > > +					worker_params[i]->dma_name,
> > > > avg_cycles, buf_size,
> > > > +					nr_buf / nb_workers, memory,
> > > > bandwidth, mops, is_dma);
> > > > +	}
> > > > +
> > > > +out:
> > > > +	/* free env */
> > > > +	if (srcs)
> > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > +	if (dsts)
> > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > +
> > > > +	if (src_pool)
> > > > +		rte_mempool_free(src_pool);
> > > > +	if (dst_pool)
> > > > +		rte_mempool_free(dst_pool);
> > > > +
> > > > +	if (is_dma) {
> > > > +		for (i = 0; i < nb_workers; i++) {
> > > > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > > > +		}
> > > > +	}
> > > > +}
> > > > diff --git a/app/test-dma-perf/config.ini
> > > > b/app/test-dma-perf/config.ini new file mode 100644 index
> > > > 0000000000..2fd9c3c387
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/config.ini
> > > > @@ -0,0 +1,59 @@
> > > > +
> > > > +; This is an example configuration file for dma-perf, which
> > > > +details the meanings of each parameter ; and instructions on how
> > > > +to use dma-
> > perf.
> > > > +
> > > > +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
> > > > +
> > > > +; Parameters:
> > > > +; "mem_size" denotes the size of the memory footprint.
> > > > +; "buf_size" denotes the memory size of a single operation.
> > > > +; "dma_ring_size" denotes the dma ring buffer size. It should be
> > > > +greater
> > > > than 64 normally.
> > > > +; "kick_batch" denotes the dma operation batch size, and should
> > > > +be greater
> > > > than 1 normally.
> > > > +
> > > > +; The format for variables is variable=first,last,increment,ADD|MUL.
> > > > +
> > > > +; src_numa_node is used to control the numa node where the source
> > > > memory is allocated.
> > > > +; dst_numa_node is used to control the numa node where the
> > > > +destination
> > > > memory is allocated.
> > > > +
> > > > +; cache_flush is used to determine whether or not the cache
> > > > +should be flushed, with 1 indicating to ; flush and 0 indicating to not
> flush.
> > > > +
> > > > +; test_seconds controls the test time of the whole case.
> > > > +
> > > > +; To use DMA for a test, please specify the "lcore_dma" parameter.
> > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > +; make sure that the value of "lcore_dma" falls within their
> > > > +range of the
> > > > values.
> > > > +
> > > > +; To use CPU for a test, please specify the "lcore" parameter.
> > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > +; make sure that the value of "lcore" falls within their range of values.
> > > > +
> > > > +; To specify a configuration file, use the "--config" flag
> > > > +followed by the path
> > > > to the file.
> > > > +
> > > > +; To specify a result file, use the "--result" flag followed by
> > > > +the path to the
> > > > file.
> > > > +; If you do not specify a result file, one will be generated with the same name as the configuration
> > > > +; file, with the addition of "_result.csv" at the end.
> > > > +
> > > > +[case1]
> > > > +type=DMA_MEM_COPY
> > > > +mem_size=10
> > > > +buf_size=64,8192,2,MUL
> > > > +dma_ring_size=1024
> > > > +kick_batch=32
> > > > +src_numa_node=0
> > > > +dst_numa_node=0
> > > > +cache_flush=0
> > > > +test_seconds=2
> > > > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > >
> > > [Anoob] Isn't it better if we allow user to specify DMA dev ID
> > > rather than the PCI DBDF?
> > >
> > > In the long run, I would expect config file to provide {core,
> > > dma_dev_id, queue_id}
> > >
> > > Another thought is why to expose this at all? If we can restrict
> > > this perf application to have one thread only use one vchan, then
> > > application can easily create this mapping in run time. Unless you
> > > want one thread to use 2 different vchans which may not be desirable
> > > since this is a standalone perf app.
> >
> > [Cheng] Thank you for the feedback.
> > Here are my thoughts:
> > Firstly, the user may not know which device the DMA dev ID corresponds
> > to, or which NUMA node it is on. In my example, I used the CBDMA
> > environment, so I did not specify the work queue ID. When using DSA,
> > the configuration would be something like lcore10@0000:00:04.2-q0
> > which contains core, dma and work queue id. The reason for exposing
> > these options is that we want the user to fully understand which cores
> > and devices are being used so that they know exactly where the
> > performance data is coming from. For example, performance when cores
> > and DMA devices are not on the same NUMA node, etc. This allows the
> > testing scenario to be precise and flexible. If the application
> > handles the mapping itself, the user loses control over the mapping
> > and may not get the performance data they want. We believe control
> > should be given to the user rather than the application.
> 
> [Anoob] I understand your viewpoints. Thanks for the explanation.
> 

[Cheng] sure, no problem.

> >
> > >
> > > > +eal_args=--in-memory --file-prefix=test
> > > > +
> > > > +[case2]
> > > > +type=CPU_MEM_COPY
> > > > +mem_size=10
> > > > +buf_size=64,8192,2,MUL
> > > > +src_numa_node=0
> > > > +dst_numa_node=1
> > > > +cache_flush=0
> > > > +test_seconds=2
> > > > +lcore = 3, 4
> > > > +eal_args=--in-memory --no-pci
> > > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > > > new file mode 100644 index 0000000000..d65655b87b
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/main.c
> > > > @@ -0,0 +1,569 @@
> > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > +
> > > > +#include <stdio.h>
> > > > +#include <stdlib.h>
> > > > +#include <getopt.h>
> > > > +#include <signal.h>
> > > > +#include <stdbool.h>
> > > > +#include <unistd.h>
> > > > +#include <sys/wait.h>
> > > > +#include <inttypes.h>
> > > > +#include <libgen.h>
> > > > +
> > > > +#include <rte_eal.h>
> > > > +#include <rte_cfgfile.h>
> > > > +#include <rte_string_fns.h>
> > > > +#include <rte_lcore.h>
> > > > +
> > > > +#include "main.h"
> > > > +
> > > > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > > > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > > > +
> > > > +#define MAX_EAL_PARAM_NB 100
> > > > +#define MAX_EAL_PARAM_LEN 1024
> > > > +
> > > > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > > > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > > > +
> > > > +#define CMDLINE_CONFIG_ARG "--config"
> > > > +#define CMDLINE_RESULT_ARG "--result"
> > > > +
> > > > +#define MAX_PARAMS_PER_ENTRY 4
> > > > +
> > > > +#define MAX_LONG_OPT_SZ 64
> > > > +
> > > > +enum {
> > > > +	TEST_TYPE_NONE = 0,
> > > > +	TEST_TYPE_DMA_MEM_COPY,
> > > > +	TEST_TYPE_CPU_MEM_COPY
> > > > +};
> > > > +
> > > > +#define MAX_TEST_CASES 16
> > > > +static struct test_configure test_cases[MAX_TEST_CASES];
> > > > +
> > > > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > +
> > > > +static FILE *fd;
> > > > +
> > > > +static void
> > > > +output_csv(bool need_blankline)
> > > > +{
> > > > +	uint32_t i;
> > > > +
> > > > +	if (need_blankline) {
> > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > +	}
> > > > +
> > > > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > > > +		if (output_str[i][0]) {
> > > > +			fprintf(fd, "%s", output_str[i]);
> > > > +			output_str[i][0] = '\0';
> > > > +		}
> > > > +	}
> > > > +
> > > > +	fflush(fd);
> > > > +}
> > > > +
> > > > +static void
> > > > +output_env_info(void)
> > > > +{
> > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > > > environment:\n");
> > > > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%"
> > > > +			PRIu64 "\n", rte_get_timer_hz());
> > > > +
> > > > +	output_csv(true);
> > > > +}
> > > > +
> > > > +static void
> > > > +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > > > +			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
> > > > +
> > > > +	output_csv(true);
> > > > +}
> > > > +
> > > > +static void
> > > > +run_test_case(struct test_configure *case_cfg) {
> > > > +	switch (case_cfg->test_type) {
> > > > +	case TEST_TYPE_DMA_MEM_COPY:
> > > > +		mem_copy_benchmark(case_cfg, true);
> > > > +		break;
> > > > +	case TEST_TYPE_CPU_MEM_COPY:
> > > > +		mem_copy_benchmark(case_cfg, false);
> > > > +		break;
> > > > +	default:
> > > > +		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> > > > +		break;
> > > > +	}
> > > > +}
> > > > +
> > > > +static void
> > > > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > > > +	uint32_t i;
> > > > +	uint32_t nb_lcores = rte_lcore_count();
> > > > +	struct test_configure_entry *mem_size = &case_cfg->mem_size;
> > > > +	struct test_configure_entry *buf_size = &case_cfg->buf_size;
> > > > +	struct test_configure_entry *ring_size = &case_cfg->ring_size;
> > > > +	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
> > > > +	struct test_configure_entry dummy = { 0 };
> > > > +	struct test_configure_entry *var_entry = &dummy;
> > > > +
> > > > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > > > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > > > +
> > > > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > > > +		printf("Case %u: Not enough lcores.\n", case_id);
> > > > +		return;
> > > > +	}
> > > > +
> > > > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > > > +
> > > > +	if (mem_size->incr != 0)
> > > > +		var_entry = mem_size;
> > > > +
> > > > +	if (buf_size->incr != 0)
> > > > +		var_entry = buf_size;
> > > > +
> > > > +	if (ring_size->incr != 0)
> > > > +		var_entry = ring_size;
> > > > +
> > > > +	if (kick_batch->incr != 0)
> > > > +		var_entry = kick_batch;
> > > > +
> > > > +	case_cfg->scenario_id = 0;
> > > > +
> > > > +	output_header(case_id, case_cfg);
> > > > +
> > > > +	for (var_entry->cur = var_entry->first; var_entry->cur <=
> > > > +var_entry-
> > > > >last;) {
> > > > +		case_cfg->scenario_id++;
> > > > +		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
> > > > +
> > > > +		run_test_case(case_cfg);
> > > > +		output_csv(false);
> > > > +
> > > > +		if (var_entry->op == OP_ADD)
> > > > +			var_entry->cur += var_entry->incr;
> > > > +		else if (var_entry->op == OP_MUL)
> > > > +			var_entry->cur *= var_entry->incr;
> > > > +		else
> > > > +			break;
> > > > +	}
> > > > +}
> > > > +
> > > > +static int
> > > > +parse_lcore(struct test_configure *test_case, const char *value) {
> > > > +	size_t len = strlen(value);
> > > > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > > > +	strcpy(input, value);
> > > > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > > > >lcore_dma_map);
> > > > +
> > > > +	if (test_case == NULL || value == NULL)
> > > > +		return -1;
> > > > +
> > > > +	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
> > > > +
> > > > +	char *token = strtok(input, ", ");
> > > > +	while (token != NULL) {
> > > > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > > > +			free(input);
> > > > +			return -1;
> > > > +		}
> > > > +
> > > > +		uint16_t lcore_id = atoi(token);
> > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;
> > > > +
> > > > +		token = strtok(NULL, ", ");
> > > > +	}
> > > > +
> > > > +	free(input);
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int
> > > > +parse_lcore_dma(struct test_configure *test_case, const char *value)
> {
> > > > +	struct lcore_dma_map_t *lcore_dma_map;
> > > > +	char *input = strndup(value, strlen(value) + 1);
> > > > +	char *addrs = input;
> > > > +	char *ptrs[2];
> > > > +	char *start, *end, *substr;
> > > > +	uint16_t lcore_id;
> > > > +	int ret = 0;
> > > > +
> > > > +	while (*addrs == '\0')
> > > > +		addrs++;
> > > > +	if (*addrs == '\0') {
> > > > +		fprintf(stderr, "No input DMA addresses\n");
> > > > +		ret = -1;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	substr = strtok(addrs, ",");
> > > > +	if (substr == NULL) {
> > > > +		fprintf(stderr, "No input DMA address\n");
> > > > +		ret = -1;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > > > lcore_dma_map_t));
> > > > +
> > > > +	do {
> > > > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > > > +
> > > > +		start = strstr(ptrs[0], "lcore");
> > > > +		if (start == NULL) {
> > > > +			fprintf(stderr, "Illegal lcore\n");
> > > > +			ret = -1;
> > > > +			break;
> > > > +		}
> > > > +
> > > > +		start += 5;
> > > > +		lcore_id = strtol(start, &end, 0);
> > > > +		if (end == start) {
> > > > +			fprintf(stderr, "No input lcore ID or ID %d is
> > > > wrong\n", lcore_id);
> > > > +			ret = -1;
> > > > +			break;
> > > > +		}
> > > > +
> > > > +		lcore_dma_map = &test_case->lcore_dma_map;
> > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
> > > > +		strcpy(lcore_dma_map->dma_names[lcore_dma_map-
> > > > >cnt], ptrs[1]);
> > > > +		lcore_dma_map->cnt++;
> > > > +		substr = strtok(NULL, ",");
> > > > +	} while (substr != NULL);
> > > > +
> > > > +out:
> > > > +	free(input);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int
> > > > +parse_entry(const char *value, struct test_configure_entry *entry) {
> > > > +	char input[255] = {0};
> > > > +	char *args[MAX_PARAMS_PER_ENTRY];
> > > > +	int args_nr = -1;
> > > > +
> > > > +	if (value == NULL || entry == NULL)
> > > > +		goto out;
> > > > +
> > > > +	strncpy(input, value, 254);
> > > > +	if (*input == '\0')
> > > > +		goto out;
> > > > +
> > > > +	args_nr = rte_strsplit(input, strlen(input), args,
> > > > MAX_PARAMS_PER_ENTRY, ',');
> > > > +	if (args_nr != 1 && args_nr != 4)
> > > > +		goto out;
> > > > +
> > > > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > > > +
> > > > +	if (args_nr == 4) {
> > > > +		entry->last = (uint32_t)atoi(args[1]);
> > > > +		entry->incr = (uint32_t)atoi(args[2]);
> > > > +		if (!strcmp(args[3], "MUL"))
> > > > +			entry->op = OP_MUL;
> > > > +		else if (!strcmp(args[3], "ADD"))
> > > > +			entry->op = OP_ADD;
> > > > +		else {
> > > > +			printf("Invalid op %s.\n", args[3]);
> > > > +			args_nr = -1;
> > > > +		}
> > > > +	} else {
> > > > +		entry->op = OP_NONE;
> > > > +		entry->last = 0;
> > > > +		entry->incr = 0;
> > > > +	}
> > > > +out:
> > > > +	return args_nr;
> > > > +}
> > > > +
> > > > +static uint16_t
> > > > +load_configs(const char *path)
> > > > +{
> > > > +	struct rte_cfgfile *cfgfile;
> > > > +	int nb_sections, i;
> > > > +	struct test_configure *test_case;
> > > > +	char section_name[CFG_NAME_LEN];
> > > > +	const char *case_type;
> > > > +	const char *lcore_dma;
> > > > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > > > *kick_batch_str;
> > > > +	int args_nr, nb_vp;
> > > > +	bool is_dma;
> > > > +
> > > > +	printf("config file parsing...\n");
> > > > +	cfgfile = rte_cfgfile_load(path, 0);
> > > > +	if (!cfgfile) {
> > > > +		printf("Open configure file error.\n");
> > > > +		exit(1);
> > > > +	}
> > > > +
> > > > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > > > +	if (nb_sections > MAX_TEST_CASES) {
> > > > +		printf("Error: The maximum number of cases is %d.\n",
> > > > MAX_TEST_CASES);
> > > > +		exit(1);
> > > > +	}
> > > > +
> > > > +	for (i = 0; i < nb_sections; i++) {
> > > > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
> > > > +		test_case = &test_cases[i];
> > > > +		case_type = rte_cfgfile_get_entry(cfgfile, section_name,
> > > > "type");
> > > > +		if (!case_type) {
> > > > +			printf("Error: No case type in case %d, the test will be
> > > > finished here.\n",
> > > > +				i + 1);
> > > > +			test_case->is_valid = false;
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > > > +			test_case->test_type =
> > > > TEST_TYPE_DMA_MEM_COPY;
> > > > +			test_case->test_type_str = DMA_MEM_COPY;
> > > > +			is_dma = true;
> > > > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > > > +			test_case->test_type =
> > > > TEST_TYPE_CPU_MEM_COPY;
> > > > +			test_case->test_type_str = CPU_MEM_COPY;
> > > > +			is_dma = false;
> > > > +		} else {
> > > > +			printf("Error: Cannot find case type %s in case%d.\n",
> > > > case_type, i + 1);
> > > > +			test_case->is_valid = false;
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		nb_vp = 0;
> > > > +
> > > > +		test_case->src_numa_node =
> > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > +
> > > > 	section_name, "src_numa_node"));
> > > > +		test_case->dst_numa_node =
> > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > +
> > > > 	section_name, "dst_numa_node"));
> > > > +
> > > > +		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> > > > "mem_size");
> > > > +		args_nr = parse_entry(mem_size_str, &test_case-
> > > > >mem_size);
> > > > +		if (args_nr < 0) {
> > > > +			printf("parse error in case %d.\n", i + 1);
> > > > +			test_case->is_valid = false;
> > > > +			continue;
> > > > +		} else if (args_nr > 1)
> > > > +			nb_vp++;
> > > > +
> > > > +		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> > > > "buf_size");
> > > > +		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
> > > > +		if (args_nr < 0) {
> > > > +			printf("parse error in case %d.\n", i + 1);
> > > > +			test_case->is_valid = false;
> > > > +			continue;
> > > > +		} else if (args_nr > 1)
> > > > +			nb_vp++;
> > > > +
> > > > +		if (is_dma) {
> > > > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > section_name,
> > > > +
> > > > 	"dma_ring_size");
> > > > +			args_nr = parse_entry(ring_size_str, &test_case-
> > > > >ring_size);
> > > > +			if (args_nr < 0) {
> > > > +				printf("parse error in case %d.\n", i + 1);
> > > > +				test_case->is_valid = false;
> > > > +				continue;
> > > > +			} else if (args_nr > 1)
> > > > +				nb_vp++;
> > > > +
> > > > +			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
> > > > section_name, "kick_batch");
> > > > +			args_nr = parse_entry(kick_batch_str, &test_case-
> > > > >kick_batch);
> > > > +			if (args_nr < 0) {
> > > > +				printf("parse error in case %d.\n", i + 1);
> > > > +				test_case->is_valid = false;
> > > > +				continue;
> > > > +			} else if (args_nr > 1)
> > > > +				nb_vp++;
> > > > +
> > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > section_name, "lcore_dma");
> > > > +			int lcore_ret = parse_lcore_dma(test_case,
> > > > lcore_dma);
> > > > +			if (lcore_ret < 0) {
> > > > +				printf("parse lcore dma error in case %d.\n", i + 1);
> > > > +				test_case->is_valid = false;
> > > > +				continue;
> > > > +			}
> > > > +		} else {
> > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > section_name, "lcore");
> > > > +			int lcore_ret = parse_lcore(test_case, lcore_dma);
> > > > +			if (lcore_ret < 0) {
> > > > +				printf("parse lcore error in case %d.\n", i + 1);
> > > > +				test_case->is_valid = false;
> > > > +				continue;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		if (nb_vp > 1) {
> > > > +			printf("Error, each section can only have a single
> > > > variable parameter.\n");
> > > > +			test_case->is_valid = false;
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		test_case->cache_flush =
> > > > +			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name,
> > > > "cache_flush"));
> > > > +		test_case->test_secs =
> > > > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > +					section_name, "test_seconds"));
> > > > +
> > > > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > > > section_name, "eal_args");
> > > > +		test_case->is_valid = true;
> > > > +	}
> > > > +
> > > > +	rte_cfgfile_close(cfgfile);
> > > > +	printf("config file parsing complete.\n\n");
> > > > +	return i;
> > > > +}
> > > > +
> > > > +/* Parse the argument given in the command line of the
> > > > +application */ static int append_eal_args(int argc, char **argv,
> > > > +const char *eal_args, char **new_argv) {
> > > > +	int i;
> > > > +	char *tokens[MAX_EAL_PARAM_NB];
> > > > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > > > +	int token_nb, new_argc = 0;
> > > > +
> > > > +	for (i = 0; i < argc; i++) {
> > > > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > > > +				(strcmp(argv[i], CMDLINE_RESULT_ARG) ==
> > > > 0)) {
> > > > +			i++;
> > > > +			continue;
> > > > +		}
> > > > +		strlcpy(new_argv[new_argc], argv[i],
> > > > sizeof(new_argv[new_argc]));
> > > > +		new_argc++;
> > > > +	}
> > > > +
> > > > +	if (eal_args) {
> > > > +		strlcpy(args, eal_args, sizeof(args));
> > > > +		token_nb = rte_strsplit(args, strlen(args),
> > > > +					tokens, MAX_EAL_PARAM_NB, ' ');
> > > > +		for (i = 0; i < token_nb; i++)
> > > > +			strcpy(new_argv[new_argc++], tokens[i]);
> > > > +	}
> > > > +
> > > > +	return new_argc;
> > > > +}
> > > > +
> > > > +int
> > > > +main(int argc, char *argv[])
> > > > +{
> > > > +	int ret;
> > > > +	uint16_t case_nb;
> > > > +	uint32_t i, nb_lcores;
> > > > +	pid_t cpid, wpid;
> > > > +	int wstatus;
> > > > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > > > +	char *pargs[MAX_EAL_PARAM_NB];
> > > > +	char *cfg_path_ptr = NULL;
> > > > +	char *rst_path_ptr = NULL;
> > > > +	char rst_path[PATH_MAX];
> > > > +	int new_argc;
> > > > +	bool is_first_case = true;
> > > > +
> > > > +	memset(args, 0, sizeof(args));
> > > > +
> > > > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > > > +		pargs[i] = args[i];
> > > > +
> > > > +	for (i = 0; i < (uint32_t)argc; i++) {
> > > > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > > > MAX_LONG_OPT_SZ) == 0)
> > > > +			cfg_path_ptr = argv[i + 1];
> > > > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > > > MAX_LONG_OPT_SZ) == 0)
> > > > +			rst_path_ptr = argv[i + 1];
> > > > +	}
> > > > +	if (cfg_path_ptr == NULL) {
> > > > +		printf("Config file not assigned.\n");
> > > > +		return -1;
> > > > +	}
> > > > +	if (rst_path_ptr == NULL) {
> > > > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > > > +		strcat(strtok(basename(rst_path), "."), "_result.csv");
> > > > +		rst_path_ptr = rst_path;
> > > > +	}
> > > > +
> > > > +	case_nb = load_configs(cfg_path_ptr);
> > > > +	fd = fopen(rst_path_ptr, "w");
> > > > +	if (fd == NULL) {
> > > > +		printf("Open output CSV file error.\n");
> > > > +		return -1;
> > > > +	}
> > > > +	fclose(fd);
> > > > +
> > > > +	for (i = 0; i < case_nb; i++) {
> > > > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > > > +			printf("No test type in test case %d.\n\n", i + 1);
> > > > +			continue;
> > > > +		}
> > > > +		if (!test_cases[i].is_valid) {
> > > > +			printf("Invalid test case %d.\n\n", i + 1);
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		cpid = fork();
> > >
> > > [Anoob] Do we really need fork()? Can't we use code like,
> > >
> > > 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> > > 			ret |= rte_eal_wait_lcore(lcore_id);
> > > 		}
> > >
> > > to wait for all threads to exit?
> >
> > [Cheng] Good question. Fork() is used here to establish a new process
> > for the new test case. In order for each test case to have a new EAL
> > environment (for the flexibility), the EAL must be reinitialized for each case.
> > However, the EAL parameters can only be initialized once per process.
> > Therefore, we use a new process to run each new test case. Moreover,
> > each test case runs sequentially and does not affect the others,
> > ensuring the accuracy of the performance data. Your code would wait
> > for all threads to exit in the same process. However, it would not provide a
> > "clean" environment for each test case like fork() does. Fork() allows us to
> > have a fully reinitialized environment, with no impact or side effects
> > from previous test cases. This results in clean, precise performance data for
> > each case.
> >
> > Please let me know your thoughts on this. And please let me know if
> > you have any other questions or require any clarification.
> 
> [Anoob] This was just a generic observation. I do not have a strong opinion
> either way.
> 

[Cheng] sure, got it.

> >
> > Thanks,
> > Cheng
> >
> > >
> > > > +		if (cpid < 0) {
> > > > +			printf("Fork case %d failed.\n", i + 1);
> > > > +			exit(EXIT_FAILURE);
> > > > +		} else if (cpid == 0) {
> > > > +			printf("\nRunning case %u\n\n", i + 1);
> > > > +
> > > > +			new_argc = append_eal_args(argc, argv,
> > > > test_cases[i].eal_args, pargs);
> > > > +			ret = rte_eal_init(new_argc, pargs);
> > > > +			if (ret < 0)
> > > > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > > > arguments\n");
> > > > +
> > > > +			/* Check lcores. */
> > > > +			nb_lcores = rte_lcore_count();
> > > > +			if (nb_lcores < 2)
> > > > +				rte_exit(EXIT_FAILURE,
> > > > +					"There should be at least 2 worker
> > > > lcores.\n");
> > > > +
> > > > +			fd = fopen(rst_path_ptr, "a");
> > > > +			if (!fd) {
> > > > +				printf("Open output CSV file error.\n");
> > > > +				return 0;
> > > > +			}
> > > > +
> > > > +			if (is_first_case) {
> > > > +				output_env_info();
> > > > +				is_first_case = false;
> > > > +			}
> > > > +			run_test(i + 1, &test_cases[i]);
> > > > +
> > > > +			/* clean up the EAL */
> > > > +			rte_eal_cleanup();
> > > > +
> > > > +			fclose(fd);
> > > > +
> > > > +			printf("\nCase %u completed.\n\n", i + 1);
> > > > +
> > > > +			exit(EXIT_SUCCESS);
> > > > +		} else {
> > > > +			wpid = waitpid(cpid, &wstatus, 0);
> > > > +			if (wpid == -1) {
> > > > +				printf("waitpid error.\n");
> > > > +				exit(EXIT_FAILURE);
> > > > +			}
> > > > +
> > > > +			if (WIFEXITED(wstatus))
> > > > +				printf("Case process exited. status %d\n\n",
> > > > +					WEXITSTATUS(wstatus));
> > > > +			else if (WIFSIGNALED(wstatus))
> > > > +				printf("Case process killed by signal %d\n\n",
> > > > +					WTERMSIG(wstatus));
> > > > +			else if (WIFSTOPPED(wstatus))
> > > > +				printf("Case process stopped by signal
> > > > %d\n\n",
> > > > +					WSTOPSIG(wstatus));
> > > > +			else if (WIFCONTINUED(wstatus))
> > > > +				printf("Case process continued.\n\n");
> > > > +			else
> > > > +				printf("Case process unknown
> > > > terminated.\n\n");
> > > > +		}
> > > > +	}
> > > > +
> > > > +	printf("Bye...\n");
> > > > +	return 0;
> > > > +}
> > > > +
> > > > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> > > > new file mode 100644 index 0000000000..215ac42673
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/main.h
> > > > @@ -0,0 +1,69 @@
> > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > +
> > > > +#ifndef _MAIN_H_
> > > > +#define _MAIN_H_
> > > > +
> > > > +
> > > > +#include <rte_common.h>
> > > > +#include <rte_cycles.h>
> > > > +#include <rte_dev.h>
> > > > +#include <rte_dmadev.h>
> > > > +
> > > > +#ifndef __maybe_unused
> > > > +#define __maybe_unused	__rte_unused
> > > > +#endif
> > > > +
> > > > +#define MAX_WORKER_NB 128
> > > > +#define MAX_OUTPUT_STR_LEN 512
> > > > +
> > > > +#define MAX_DMA_NB 128
> > > > +#define MAX_LCORE_NB 256
> > > > +
> > > > +extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > +
> > > > +typedef enum {
> > > > +	OP_NONE = 0,
> > > > +	OP_ADD,
> > > > +	OP_MUL
> > > > +} alg_op_type;
> > > > +
> > > > +struct test_configure_entry {
> > > > +	uint32_t first;
> > > > +	uint32_t last;
> > > > +	uint32_t incr;
> > > > +	alg_op_type op;
> > > > +	uint32_t cur;
> > > > +};
> > > > +
> > > > +struct lcore_dma_map_t {
> > > > +	uint32_t lcores[MAX_WORKER_NB];
> > > > +	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > > > +	int16_t dma_ids[MAX_WORKER_NB];
> > > > +	uint16_t cnt;
> > > > +};
> > > > +
> > > > +struct test_configure {
> > > > +	bool is_valid;
> > > > +	uint8_t test_type;
> > > > +	const char *test_type_str;
> > > > +	uint16_t src_numa_node;
> > > > +	uint16_t dst_numa_node;
> > > > +	uint16_t opcode;
> > > > +	bool is_dma;
> > > > +	struct lcore_dma_map_t lcore_dma_map;
> > > > +	struct test_configure_entry mem_size;
> > > > +	struct test_configure_entry buf_size;
> > > > +	struct test_configure_entry ring_size;
> > > > +	struct test_configure_entry kick_batch;
> > > > +	uint32_t cache_flush;
> > > > +	uint32_t nr_buf;
> > > > +	uint16_t test_secs;
> > > > +	const char *eal_args;
> > > > +	uint8_t scenario_id;
> > > > +};
> > > > +
> > > > +void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> > > > +
> > > > +#endif /* _MAIN_H_ */
> > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/meson.build
> > > > @@ -0,0 +1,17 @@
> > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > > +Intel Corporation
> > > > +
> > > > +# meson file, for building this app as part of a main DPDK build.
> > > > +
> > > > +if is_windows
> > > > +    build = false
> > > > +    reason = 'not supported on Windows'
> > > > +    subdir_done()
> > > > +endif
> > > > +
> > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > +
> > > > +sources = files(
> > > > +        'main.c',
> > > > +        'benchmark.c',
> > > > +)
> > > > --
> > > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-15 14:05         ` Jiang, Cheng1
@ 2023-06-15 15:47           ` Anoob Joseph
  2023-06-16  2:56             ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-15 15:47 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Cheng,

Please see inline.

Thanks,
Anoob

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Thursday, June 15, 2023 7:36 PM
> To: Anoob Joseph <anoobj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Anoob,
> 
> Replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Thursday, June 15, 2023 4:45 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> application
> >
> > Hi Cheng,
> >
> > Please see inline.
> >
> > Thanks,
> > Anoob
> >
> > > -----Original Message-----
> > > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Sent: Thursday, June 15, 2023 1:31 PM
> > > To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> Amit
> > > Prakash Shukla <amitprakashs@marvell.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> > YuanX
> > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi,
> > >
> > > Thanks for your comments, the replies are inline.
> > >
> > > Thanks,
> > > Cheng
> > >
> > > > -----Original Message-----
> > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > Sent: Thursday, June 15, 2023 1:22 PM
> > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> > Amit
> > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang,
> > > YuanX
> > > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > Hi,
> > > >
> > > > Thanks for working on the comments. Few more top level comment
> > inline.
> > > >
> > > > Thanks,
> > > > Anoob
> > > >
> > > > > -----Original Message-----
> > > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > Sent: Tuesday, June 13, 2023 10:02 AM
> > > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash
> > > Shukla
> > > > > <amitprakashs@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> > > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > > xingguang.he@intel.com;
> > > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > application
> > > > >
> > > > > External Email
> > > > >
> > > > > ------------------------------------------------------------------
> > > > > --
> > > > > -- There are many high-performance DMA devices supported in DPDK
> > > > > now, and these DMA devices can also be integrated into other
> > > > > modules of DPDK as accelerators, such as Vhost. Before integrating
> > > > > DMA into applications, developers need to know the performance of
> > > > > these DMA devices in various scenarios and the performance of CPUs
> > > > > in the same scenario, such as different buffer lengths. Only in
> > > > > this way can we know the target performance of the application
> > > > > accelerated by using them. This patch introduces a
> > > > > high-performance testing tool, which supports comparing the
> > > > > performance of CPU and DMA in different scenarios automatically
> > > > > with a pre- set config file. Memory Copy performance test are
> > > > supported for now.
> > > > >
> > > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > > ---
> > > > > v6:
> > > > >   improved code based on Anoob's comments;
> > > > >   fixed some code structure issues;
> > > > > v5:
> > > > >   fixed some LONG_LINE warnings;
> > > > > v4:
> > > > >   fixed inaccuracy of the memory footprint display;
> > > > > v3:
> > > > >   fixed some typos;
> > > > > v2:
> > > > >   added lcore/dmadev designation;
> > > > >   added error case process;
> > > > >   removed worker_threads parameter from config.ini;
> > > > >   improved the logs;
> > > > >   improved config file;
> > > > >
> > > > >  app/meson.build               |   1 +
> > > > >  app/test-dma-perf/benchmark.c | 477
> > > ++++++++++++++++++++++++++++
> > > > > app/test-dma-perf/config.ini  |  59 ++++
> > > > >  app/test-dma-perf/main.c      | 569
> > > > > ++++++++++++++++++++++++++++++++++
> > > > >  app/test-dma-perf/main.h      |  69 +++++
> > > > >  app/test-dma-perf/meson.build |  17 +
> > > > >  6 files changed, 1192 insertions(+)  create mode 100644
> > > > > app/test-dma-perf/benchmark.c  create mode 100644
> > > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > > > mode
> > > > > 100644 app/test-dma-perf/meson.build
> > > > >
> > > > > diff --git a/app/meson.build b/app/meson.build index
> > > > > 74d2420f67..4fc1a83eba 100644
> > > > > --- a/app/meson.build
> > > > > +++ b/app/meson.build
> > > > > @@ -19,6 +19,7 @@ apps = [
> > > > >          'test-cmdline',
> > > > >          'test-compress-perf',
> > > > >          'test-crypto-perf',
> > > > > +        'test-dma-perf',
> > > > >          'test-eventdev',
> > > > >          'test-fib',
> > > > >          'test-flow-perf',
> > > > > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > > > > perf/benchmark.c new file mode 100644 index
> 0000000000..bc1ca82297
> > > > > --- /dev/null
> > > > > +++ b/app/test-dma-perf/benchmark.c
> > > > > @@ -0,0 +1,477 @@
> > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > +
> > > > > +#include <inttypes.h>
> > > > > +#include <stdio.h>
> > > > > +#include <stdlib.h>
> > > > > +#include <unistd.h>
> > > > > +
> > > > > +#include <rte_time.h>
> > > > > +#include <rte_mbuf.h>
> > > > > +#include <rte_dmadev.h>
> > > > > +#include <rte_malloc.h>
> > > > > +#include <rte_lcore.h>
> > > > > +
> > > > > +#include "main.h"
> > > > > +
> > > > > +#define MAX_DMA_CPL_NB 255
> > > > > +
> > > > > +#define TEST_WAIT_U_SECOND 10000
> > > > > +
> > > > > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%"
> > > PRIu64
> > > > > ",%.3lf,%.3lf\n"
> > > > > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%"
> > > PRIu64
> > > > > ",%.3lf,%.3lf\n"
> > > > > +
> > > > > +struct worker_info {
> > > > > +	bool ready_flag;
> > > > > +	bool start_flag;
> > > > > +	bool stop_flag;
> > > > > +	uint32_t total_cpl;
> > > > > +	uint32_t test_cpl;
> > > > > +};
> > > > > +
> > > > > +struct lcore_params {
> > > > > +	uint8_t scenario_id;
> > > > > +	unsigned int lcore_id;
> > > > > +	char *dma_name;
> > > > > +	uint16_t worker_id;
> > > > > +	uint16_t dev_id;
> > > > > +	uint32_t nr_buf;
> > > > > +	uint16_t kick_batch;
> > > > > +	uint32_t buf_size;
> > > > > +	uint16_t test_secs;
> > > > > +	struct rte_mbuf **srcs;
> > > > > +	struct rte_mbuf **dsts;
> > > > > +	struct worker_info worker_info;
> > > > > +};
> > > > > +
> > > > > +static struct rte_mempool *src_pool; static struct rte_mempool
> > > > > +*dst_pool;
> > > > > +
> > > > > +static volatile struct lcore_params
> > > *worker_params[MAX_WORKER_NB];
> > > > > +
> > > > > +#define PRINT_ERR(...) print_err(__func__, __LINE__,
> __VA_ARGS__)
> > > > > +
> > > > > +static inline int
> > > > > +__rte_format_printf(3, 4)
> > > > > +print_err(const char *func, int lineno, const char *format, ...) {
> > > > > +	va_list ap;
> > > > > +	int ret;
> > > > > +
> > > > > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > > > > +	va_start(ap, format);
> > > > > +	ret += vfprintf(stderr, format, ap);
> > > > > +	va_end(ap);
> > > > > +
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static inline void
> > > > > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t
> > > > > +nb_workers,
> > > > > uint16_t test_secs,
> > > > > +				uint32_t total_cnt, float *memory,
> uint32_t
> > > > > *ave_cycle,
> > > > > +				float *bandwidth, float *mops) {
> > > > > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) /
> (1024
> > > > > +*
> > > > > 1024);
> > > > > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > > > > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > > > > (float)*ave_cycle)) / 1000000000;

[Anoob] The above calculation may not yield accurate results. 'ave_cycle' is stored as an integer, so the value is truncated and the reported bandwidth can take only a few distinct values. Instead, we can do the calculation directly, like this:

	*bandwidth = ((float)buf_size * 8 * total_cnt / test_secs) / 1000000000;
	*mops = (float)total_cnt / test_secs / 1000000;

Same issue is there with below calculation as well. Please check.

Side note: in bandwidth calculation, shouldn't we be dividing by 1024*1024*1024? I've just carried the calculation that you used. Feel free to correct as required.

> > > > > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > > > > +
> > > > > +static void
> > > > > +output_result(uint8_t scenario_id, uint32_t lcore_id, char
> > > > > +*dma_name,
> > > > > uint64_t ave_cycle,
> > > > > +			uint32_t buf_size, uint32_t nr_buf, float
> memory,
> > > > > +			float bandwidth, float mops, bool is_dma) {
> > > > > +	if (is_dma)
> > > > > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > > > > +	else
> > > > > +		printf("lcore %u\n", lcore_id);
> > > > > +
> > > > > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u,
> nr_buf:
> > > > > +%u,
> > > > > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > > > > +			ave_cycle, buf_size, nr_buf, memory,
> > > > > rte_get_timer_hz());
> > > > > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n",
> bandwidth,
> > > > > +mops);
> > > > > +
> > > > > +	if (is_dma)
> > > > > +		snprintf(output_str[lcore_id],
> MAX_OUTPUT_STR_LEN,
> > > > > CSV_LINE_DMA_FMT,
> > > > > +			scenario_id, lcore_id, dma_name, buf_size,
> > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> mops);
> > > > > +	else
> > > > > +		snprintf(output_str[lcore_id],
> MAX_OUTPUT_STR_LEN,
> > > > > CSV_LINE_CPU_FMT,
> > > > > +			scenario_id, lcore_id, buf_size,
> > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> mops); }
> > > > > +
> > > > > +static inline void
> > > > > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > > > > +		__maybe_unused uint32_t buf_size,
> > > > > +		__maybe_unused uint32_t nr_buf) { #ifdef
> > RTE_ARCH_X86_64
> > > > > +	char *data;
> > > > > +	struct rte_mbuf **srcs = array;
> > > > > +	uint32_t i, offset;
> > > > > +
> > > > > +	for (i = 0; i < nr_buf; i++) {
> > > > > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > > > > +		for (offset = 0; offset < buf_size; offset += 64)
> > > > > +			__builtin_ia32_clflush(data + offset);
> > > > > +	}
> > > > > +#endif
> > > > > +}
> > > > > +
> > > > > +/* Configuration of device. */
> > > > > +static void
> > > > > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > > > > +	uint16_t vchan = 0;
> > > > > +	struct rte_dma_info info;
> > > > > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > > > +	struct rte_dma_vchan_conf qconf = {
> > > > > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > > > +		.nb_desc = ring_size
> > > > > +	};
> > > > > +
> > > > > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > > > > +		rte_exit(EXIT_FAILURE, "Error with dma
> configure.\n");
> > > > > +
> > > > > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > > > > +		rte_exit(EXIT_FAILURE, "Error with queue
> configuration.\n");
> > > > > +
> > > > > +	rte_dma_info_get(dev_id, &info);
> > > > > +	if (info.nb_vchans != 1)
> > > > > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > > > > reported on device id. %u\n",
> > > > > +				dev_id);
> > > > > +
> > > > > +	if (rte_dma_start(dev_id) != 0)
> > > > > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > > > > +
> > > > > +static int
> > > > > +config_dmadevs(struct test_configure *cfg) {
> > > > > +	uint32_t ring_size = cfg->ring_size.cur;
> > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > +	uint32_t nb_workers = ldm->cnt;
> > > > > +	uint32_t i;
> > > > > +	int dev_id;
> > > > > +	uint16_t nb_dmadevs = 0;
> > > > > +	char *dma_name;
> > > > > +
> > > > > +	for (i = 0; i < ldm->cnt; i++) {
> > > > > +		dma_name = ldm->dma_names[i];
> > > > > +		dev_id =
> rte_dma_get_dev_id_by_name(dma_name);
> > > > > +		if (dev_id == -1) {
> > > > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > > > dma_name);
> > > > > +			goto end;
> > > > > +		}
> > > > > +
> > > > > +		ldm->dma_ids[i] = dev_id;
> > > > > +		configure_dmadev_queue(dev_id, ring_size);
> > > > > +		++nb_dmadevs;
> > > > > +	}
> > > > > +
> > > > > +end:
> > > > > +	if (nb_dmadevs < nb_workers) {
> > > > > +		printf("Not enough dmadevs (%u) for all workers
> (%u).\n",
> > > > > nb_dmadevs, nb_workers);
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +#define POLL_MAX 1000
> > > > > +
> > > > > +
> > > >
> > > > [Anoob] Extra blank line. You can consider removing.
> > >
> > > [Cheng] sure, sorry for the miss.
> > >
> > > >
> > > > > +static inline void
> > > > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > > > > +			volatile struct worker_info *worker_info) {
> > > > > +	int ret;
> > > > > +	uint16_t nr_cpl;
> > > > > +
> > > > > +	ret = rte_dma_submit(dev_id, 0);
> > > > > +	if (ret < 0) {
> > > > > +		rte_dma_stop(dev_id);
> > > > > +		rte_dma_close(dev_id);
> > > > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > > > +	}
> > > > > +
> > > > > +	nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL,
> > > > > NULL);
> > > > > +	*async_cnt -= nr_cpl;
> > > > > +	worker_info->total_cpl += nr_cpl; }
> > > > > +
> > > > > +static inline int
> > > > > +do_dma_mem_copy(void *p)
> > > > > +{
> > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > +	volatile struct lcore_params *para =
> worker_params[*para_idx];
> > > > > +	volatile struct worker_info *worker_info = &(para-
> >worker_info);
> > > > > +	const uint16_t dev_id = para->dev_id;
> > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > +	const uint16_t kick_batch = para->kick_batch;
> > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > +	uint16_t nr_cpl;
> > > > > +	uint64_t async_cnt = 0;
> > > > > +	uint32_t i;
> > > > > +	uint32_t poll_cnt = 0;
> > > > > +	int ret;
> > > > > +
> > > > > +	worker_info->stop_flag = false;
> > > > > +	worker_info->ready_flag = true;
> > > > > +
> > > > > +	while (!worker_info->start_flag)
> > > > > +		;
> > > > > +
> > > > > +	while (1) {
> > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > +dma_copy:
> > > > > +			ret = rte_dma_copy(dev_id, 0,
> > > > > rte_pktmbuf_iova(srcs[i]),
> > > > > +				rte_pktmbuf_iova(dsts[i]), buf_size,
> 0);
> > > > > +			if (unlikely(ret < 0)) {
> > > > > +				if (ret == -ENOSPC) {
> > > > > +
> 	do_dma_submit_and_poll(dev_id,
> > > > > &async_cnt, worker_info);
> > > > > +					goto dma_copy;
> > > > > +				} else {
> > > > > +					/* Error exit */
> > > > > +					rte_dma_stop(dev_id);
> > > > > +					rte_exit(EXIT_FAILURE,
> "DMA
> > > > > enqueue failed\n");
> > > > > +				}
> > > > > +			}
> > > > > +			async_cnt++;
> > > > > +
> > > > > +			if ((async_cnt % kick_batch) == 0)
> > > > > +				do_dma_submit_and_poll(dev_id,
> > > > > &async_cnt, worker_info);
> > > > > +		}
> > > > > +
> > > > > +		if (worker_info->stop_flag)
> > > > > +			break;
> > > > > +	}
> > > > > +
> > > > > +	rte_dma_submit(dev_id, 0);
> > > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > > +		async_cnt -= nr_cpl;
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static inline int
> > > > > +do_cpu_mem_copy(void *p)
> > > > > +{
> > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > +	volatile struct lcore_params *para =
> worker_params[*para_idx];
> > > > > +	volatile struct worker_info *worker_info = &(para-
> >worker_info);
> > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > +	uint32_t i;
> > > > > +
> > > > > +	worker_info->stop_flag = false;
> > > > > +	worker_info->ready_flag = true;
> > > > > +
> > > > > +	while (!worker_info->start_flag)
> > > > > +		;
> > > > > +
> > > > > +	while (1) {
> > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > +			/* copy buffer form src to dst */
> > > > > +			rte_memcpy((void
> > > > > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > > > > +				(void
> > > > > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > > > > +				(size_t)buf_size);
> > > > > +			worker_info->total_cpl++;
> > > > > +		}
> > > > > +		if (worker_info->stop_flag)
> > > > > +			break;
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf
> > > ***srcs,
> > > > > +			struct rte_mbuf ***dsts)
> > > > > +{
> > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > +	unsigned int nr_sockets;
> > > > > +	uint32_t nr_buf = cfg->nr_buf;
> > > > > +
> > > > > +	nr_sockets = rte_socket_count();
> > > > > +	if (cfg->src_numa_node >= nr_sockets ||
> > > > > +		cfg->dst_numa_node >= nr_sockets) {
> > > > > +		printf("Error: Source or destination numa exceeds
> the acture
> > > > > numa nodes.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	src_pool =
> rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > > > > +			nr_buf, /* n == num elements */
> > > > > +			64,  /* cache size */
> > > > > +			0,   /* priv size */
> > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > +			cfg->src_numa_node);
> > > > > +	if (src_pool == NULL) {
> > > > > +		PRINT_ERR("Error with source mempool
> creation.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	dst_pool =
> rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > > > > +			nr_buf, /* n == num elements */
> > > > > +			64,  /* cache size */
> > > >
> > > > [Anoob] We do not alloc or free pointers in the datapath, right? So
> > > > why bother with cache?
> > >
> > > [Cheng] Yes, you are right, the cache size is not necessary here, I'll
> > > fix it in the next version.
> > >
> > > >
> > > > > +			0,   /* priv size */
> > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > +			cfg->dst_numa_node);
> > > > > +	if (dst_pool == NULL) {
> > > > > +		PRINT_ERR("Error with destination mempool
> creation.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> 0);
> > > > > +	if (*srcs == NULL) {
> > > > > +		printf("Error: srcs malloc failed.\n");
> > > > > +		return -1;
> > > > > +	}
> > > >
> > > > [Anoob] Are we freeing these memory? The ones allocated with
> > > rte_malloc.
> > >
> > > [Cheng] yes, we freed the memory in the end of
> mem_copy_benchmark()
> > > when we finished the test.
> >
> > [Anoob] I think we are not freeing this mem. In the place where we free all
> > mem, we do free all objects to mempool as well as the mempools. But this
> > memory is to hold the pointers, right? Is that getting freed anywhere?
> >
> > Also, in the mem clearing paths, do we need to clear the static variables (ie,
> > set srcs, src_pool, dsts, dst_pool to NULL) so that there won't be any scope
> > for any double free.
> >
> 
> [Cheng] My apologies for the misunderstanding earlier. I now understand
> your point that you are right, the memory used to store the pointers is not
> being freed. I will fix this issue in the next version. Regarding the static
> variables you mentioned, I agree with your view that they should be cleared.
> I will address this in the upcoming version as well. Thank you very much for
> the feedback. It is greatly appreciated.
> 
> In addition, I think we also need to nullify these variables when initializing
> them to ensure safety and standardization of use. What do you think?

[Anoob] Since these are static variables, it is probably okay to skip the init part. But when we use them, we should clear them after use.

Please check above; I've posted one more comment there, in case you missed it.

> 
> Thanks!
> 
> > >
> > > >
> > > > > +
> > > > > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> 0);
> > > > > +	if (*dsts == NULL) {
> > > > > +		printf("Error: dsts malloc failed.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf)
> != 0) {
> > > > > +		printf("get src mbufs failed.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf)
> != 0) {
> > > > > +		printf("get dst mbufs failed.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > > > +	uint16_t i;
> > > > > +	uint32_t offset;
> > > > > +	unsigned int lcore_id = 0;
> > > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > > +1024) /
> > > > > (cfg->buf_size.cur * 2);
> > > > > +	uint16_t nb_workers = ldm->cnt;
> > > > > +	uint16_t test_secs = cfg->test_secs;
> > > > > +	float memory;
> > > > > +	uint32_t avg_cycles = 0;
> > > > > +	float mops;
> > > > > +	float bandwidth;
> > > > > +
> > > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > > +		goto out;
> > > > > +
> > > > > +	if (is_dma)
> > > > > +		if (config_dmadevs(cfg) < 0)
> > > > > +			goto out;
> > > > > +
> > > > > +	if (cfg->cache_flush) {
> > > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > > +		rte_mb();
> > > > > +	}
> > > > > +
> > > > > +	printf("Start testing....\n");
> > > > > +
> > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > +		lcore_id = ldm->lcores[i];
> > > > > +		offset = nr_buf / nb_workers * i;
> > > > > +
> > > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > > lcore_params), 0);
> > > > > +		if (!worker_params[i]) {
> > > > > +			printf("lcore parameters malloc failure for
> lcore
> > > > > %d\n", lcore_id);
> > > > > +			break;
> > > > > +		}
> > > >
> > > > [Anoob] Are we freeing the above memory?
> > >
> > > [Cheng] sorry, I missed that, I'll add worker_params memory free in
> > > the next version, thanks.
> > >
> > > >
> > > > > +		if (is_dma) {
> > > > > +			worker_params[i]->dma_name = ldm-
> > > > > >dma_names[i];
> > > > > +			worker_params[i]->dev_id = ldm-
> >dma_ids[i];
> > > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > > +		}
> > > > > +		worker_params[i]->worker_id = i;
> > > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > > nb_workers);
> > > > > +		worker_params[i]->buf_size = buf_size;
> > > > > +		worker_params[i]->test_secs = test_secs;
> > > > > +		worker_params[i]->srcs = srcs + offset;
> > > > > +		worker_params[i]->dsts = dsts + offset;
> > > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > > +
> > > > > +		if (is_dma)
> > > > > +
> 	rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > > *)(&i), lcore_id);
> > > > > +		else
> > > > > +			rte_eal_remote_launch(do_cpu_mem_copy,
> (void
> > > > > *)(&i), lcore_id);
> > > > > +	}
> > > > > +
> > > > > +	while (1) {
> > > > > +		bool ready = true;
> > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > +			if (worker_params[i]-
> >worker_info.ready_flag ==
> > > > > false) {
> > > > > +				ready = 0;
> > > > > +				break;
> > > > > +			}
> > > > > +		}
> > > > > +		if (ready)
> > > > > +			break;
> > > > > +	}
> > > > > +
> > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > > +
> > > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > +worker_params[i]->worker_info.total_cpl;
> > > > > +
> > > > > +	usleep(test_secs * 1000 * 1000);
> > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > worker_params[i]->worker_info.total_cpl -
> > > > > +						worker_params[i]-
> > > > > >worker_info.test_cpl;
> > > > > +
> > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > > +
> > > > > +	rte_eal_mp_wait_lcore();
> > > > > +
> > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > > +			worker_params[i]->worker_info.test_cpl,
> > > > > +			&memory, &avg_cycles, &bandwidth,
> &mops);
> > > > > +		output_result(cfg->scenario_id, worker_params[i]-
> >lcore_id,
> > > > > +					worker_params[i]-
> >dma_name,
> > > > > avg_cycles, buf_size,
> > > > > +					nr_buf / nb_workers,
> memory,
> > > > > bandwidth, mops, is_dma);
> > > > > +	}
> > > > > +
> > > > > +out:
> > > > > +	/* free env */
> > > > > +	if (srcs)
> > > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > > +	if (dsts)
> > > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > > +
> > > > > +	if (src_pool)
> > > > > +		rte_mempool_free(src_pool);
> > > > > +	if (dst_pool)
> > > > > +		rte_mempool_free(dst_pool);
> > > > > +
> > > > > +	if (is_dma) {
> > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > +			printf("Stopping dmadev %d\n", ldm-
> >dma_ids[i]);
> > > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > > > > +		}
> > > > > +	}
> > > > > +}
> > > > > diff --git a/app/test-dma-perf/config.ini
> > > > > b/app/test-dma-perf/config.ini new file mode 100644 index
> > > > > 0000000000..2fd9c3c387
> > > > > --- /dev/null
> > > > > +++ b/app/test-dma-perf/config.ini
> > > > > @@ -0,0 +1,59 @@
> > > > > +
> > > > > +; This is an example configuration file for dma-perf, which
> > > > > +details the meanings of each parameter ; and instructions on how
> > > > > +to use dma-
> > > perf.
> > > > > +
> > > > > +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
> > > > > +
> > > > > +; Parameters:
> > > > > +; "mem_size" denotes the size of the memory footprint.
> > > > > +; "buf_size" denotes the memory size of a single operation.
> > > > > +; "dma_ring_size" denotes the dma ring buffer size. It should be
> > > > > +greater
> > > > > than 64 normally.
> > > > > +; "kick_batch" denotes the dma operation batch size, and should
> > > > > +be greater
> > > > > than 1 normally.
> > > > > +
> > > > > +; The format for variables is variable=first,last,increment,ADD|MUL.
> > > > > +
> > > > > +; src_numa_node is used to control the numa node where the
> source
> > > > > memory is allocated.
> > > > > +; dst_numa_node is used to control the numa node where the
> > > > > +destination
> > > > > memory is allocated.
> > > > > +
> > > > > +; cache_flush is used to determine whether or not the cache
> > > > > +should be flushed, with 1 indicating to ; flush and 0 indicating to not
> > flush.
> > > > > +
> > > > > +; test_seconds controls the test time of the whole case.
> > > > > +
> > > > > +; To use DMA for a test, please specify the "lcore_dma" parameter.
> > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > +; make sure that the value of "lcore_dma" falls within their
> > > > > +range of the
> > > > > values.
> > > > > +
> > > > > +; To use CPU for a test, please specify the "lcore" parameter.
> > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > +; make sure that the value of "lcore" falls within their range of
> values.
> > > > > +
> > > > > +; To specify a configuration file, use the "--config" flag
> > > > > +followed by the path
> > > > > to the file.
> > > > > +
> > > > > +; To specify a result file, use the "--result" flag followed by
> > > > > +the path to the
> > > > > file.
> > > > > +; If you do not specify a result file, one will be generated with
> > > > > +the same name as the configuration ; file, with the addition of
> > > > > +"_result.csv" at
> > > > > the end.
> > > > > +
> > > > > +[case1]
> > > > > +type=DMA_MEM_COPY
> > > > > +mem_size=10
> > > > > +buf_size=64,8192,2,MUL
> > > > > +dma_ring_size=1024
> > > > > +kick_batch=32
> > > > > +src_numa_node=0
> > > > > +dst_numa_node=0
> > > > > +cache_flush=0
> > > > > +test_seconds=2
> > > > > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > > >
> > > > [Anoob] Isn't it better if we allow user to specify DMA dev ID
> > > > rather than the PCI DBDF?
> > > >
> > > > In the long run, I would expect config file to provide {core,
> > > > dma_dev_id, queue_id}
> > > >
> > > > Another thought is why to expose this at all? If we can restrict
> > > > this perf application to have one thread only use one vchan, then
> > > > application can easily create this mapping in run time. Unless you
> > > > want one thread to use 2 different vchans which may not be desirable
> > > since this is a standalone perf app.
> > >
> > > [Cheng] Thank you for the feedback.
> > > Here are my thoughts:
> > > Firstly, the user may not know which device the DMA dev ID corresponds
> > > to, or which NUMA node it is on. In my example, I used the CBDMA
> > > environment, so I did not specify the work queue ID. When using DSA,
> > > the configuration would be something like lcore10@0000:00:04.2-q0
> > > which contains core, dma and work queue id. The reason for exposing
> > > these options is that we want the user to fully understand which cores
> > > and devices are being used so that they know exactly where the
> > > performance data is coming from. For example, performance when cores
> > > and DMA devices are not on the same NUMA node, etc. This allows the
> > > testing scenario to be precise and flexible. If the application
> > > handles the mapping itself, the user loses control over the mapping
> > > and may not get the performance data they want. We believe control
> > > should be given to the user rather than the application.
> >
> > [Anoob] I understand your viewpoints. Thanks for the explanation.
> >
> 
> [Cheng] sure, no problem.
> 
> > >
> > > >
> > > > > +eal_args=--in-memory --file-prefix=test
> > > > > +
> > > > > +[case2]
> > > > > +type=CPU_MEM_COPY
> > > > > +mem_size=10
> > > > > +buf_size=64,8192,2,MUL
> > > > > +src_numa_node=0
> > > > > +dst_numa_node=1
> > > > > +cache_flush=0
> > > > > +test_seconds=2
> > > > > +lcore = 3, 4
> > > > > +eal_args=--in-memory --no-pci
> > > > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > > > > new file mode 100644 index 0000000000..d65655b87b
> > > > > --- /dev/null
> > > > > +++ b/app/test-dma-perf/main.c
> > > > > @@ -0,0 +1,569 @@
> > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > +
> > > > > +#include <stdio.h>
> > > > > +#include <stdlib.h>
> > > > > +#include <getopt.h>
> > > > > +#include <signal.h>
> > > > > +#include <stdbool.h>
> > > > > +#include <unistd.h>
> > > > > +#include <sys/wait.h>
> > > > > +#include <inttypes.h>
> > > > > +#include <libgen.h>
> > > > > +
> > > > > +#include <rte_eal.h>
> > > > > +#include <rte_cfgfile.h>
> > > > > +#include <rte_string_fns.h>
> > > > > +#include <rte_lcore.h>
> > > > > +
> > > > > +#include "main.h"
> > > > > +
> > > > > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > > > > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > > > > +
> > > > > +#define MAX_EAL_PARAM_NB 100
> > > > > +#define MAX_EAL_PARAM_LEN 1024
> > > > > +
> > > > > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > > > > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > > > > +
> > > > > +#define CMDLINE_CONFIG_ARG "--config"
> > > > > +#define CMDLINE_RESULT_ARG "--result"
> > > > > +
> > > > > +#define MAX_PARAMS_PER_ENTRY 4
> > > > > +
> > > > > +#define MAX_LONG_OPT_SZ 64
> > > > > +
> > > > > +enum {
> > > > > +	TEST_TYPE_NONE = 0,
> > > > > +	TEST_TYPE_DMA_MEM_COPY,
> > > > > +	TEST_TYPE_CPU_MEM_COPY
> > > > > +};
> > > > > +
> > > > > +#define MAX_TEST_CASES 16
> > > > > +static struct test_configure test_cases[MAX_TEST_CASES];
> > > > > +
> > > > > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > +
> > > > > +static FILE *fd;
> > > > > +
> > > > > +static void
> > > > > +output_csv(bool need_blankline)
> > > > > +{
> > > > > +	uint32_t i;
> > > > > +
> > > > > +	if (need_blankline) {
> > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > +	}
> > > > > +
> > > > > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > > > > +		if (output_str[i][0]) {
> > > > > +			fprintf(fd, "%s", output_str[i]);
> > > > > +			output_str[i][0] = '\0';
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	fflush(fd);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +output_env_info(void)
> > > > > +{
> > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > > > > environment:\n");
> > > > > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU
> frequency,%"
> > > > > +			PRIu64 "\n", rte_get_timer_hz());
> > > > > +
> > > > > +	output_csv(true);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > > > > +			CSV_HDR_FMT, case_id, case_cfg-
> >test_type_str);
> > > > > +
> > > > > +	output_csv(true);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +run_test_case(struct test_configure *case_cfg) {
> > > > > +	switch (case_cfg->test_type) {
> > > > > +	case TEST_TYPE_DMA_MEM_COPY:
> > > > > +		mem_copy_benchmark(case_cfg, true);
> > > > > +		break;
> > > > > +	case TEST_TYPE_CPU_MEM_COPY:
> > > > > +		mem_copy_benchmark(case_cfg, false);
> > > > > +		break;
> > > > > +	default:
> > > > > +		printf("Unknown test type. %s\n", case_cfg-
> >test_type_str);
> > > > > +		break;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > > > > +	uint32_t i;
> > > > > +	uint32_t nb_lcores = rte_lcore_count();
> > > > > +	struct test_configure_entry *mem_size = &case_cfg-
> >mem_size;
> > > > > +	struct test_configure_entry *buf_size = &case_cfg-
> >buf_size;
> > > > > +	struct test_configure_entry *ring_size = &case_cfg-
> >ring_size;
> > > > > +	struct test_configure_entry *kick_batch = &case_cfg-
> >kick_batch;
> > > > > +	struct test_configure_entry dummy = { 0 };
> > > > > +	struct test_configure_entry *var_entry = &dummy;
> > > > > +
> > > > > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > > > > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > > > > +
> > > > > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > > > > +		printf("Case %u: Not enough lcores.\n", case_id);
> > > > > +		return;
> > > > > +	}
> > > > > +
> > > > > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > > > > +
> > > > > +	if (mem_size->incr != 0)
> > > > > +		var_entry = mem_size;
> > > > > +
> > > > > +	if (buf_size->incr != 0)
> > > > > +		var_entry = buf_size;
> > > > > +
> > > > > +	if (ring_size->incr != 0)
> > > > > +		var_entry = ring_size;
> > > > > +
> > > > > +	if (kick_batch->incr != 0)
> > > > > +		var_entry = kick_batch;
> > > > > +
> > > > > +	case_cfg->scenario_id = 0;
> > > > > +
> > > > > +	output_header(case_id, case_cfg);
> > > > > +
> > > > > +	for (var_entry->cur = var_entry->first; var_entry->cur <=
> > > > > +var_entry-
> > > > > >last;) {
> > > > > +		case_cfg->scenario_id++;
> > > > > +		printf("\nRunning scenario %d\n", case_cfg-
> >scenario_id);
> > > > > +
> > > > > +		run_test_case(case_cfg);
> > > > > +		output_csv(false);
> > > > > +
> > > > > +		if (var_entry->op == OP_ADD)
> > > > > +			var_entry->cur += var_entry->incr;
> > > > > +		else if (var_entry->op == OP_MUL)
> > > > > +			var_entry->cur *= var_entry->incr;
> > > > > +		else
> > > > > +			break;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +parse_lcore(struct test_configure *test_case, const char *value) {
> > > > > +	size_t len = strlen(value);
> > > > > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > > > > +	strcpy(input, value);
> > > > > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > > > > >lcore_dma_map);
> > > > > +
> > > > > +	if (test_case == NULL || value == NULL)
> > > > > +		return -1;
> > > > > +
> > > > > +	memset(lcore_dma_map, 0, sizeof(struct
> lcore_dma_map_t));
> > > > > +
> > > > > +	char *token = strtok(input, ", ");
> > > > > +	while (token != NULL) {
> > > > > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > > > > +			free(input);
> > > > > +			return -1;
> > > > > +		}
> > > > > +
> > > > > +		uint16_t lcore_id = atoi(token);
> > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] =
> lcore_id;
> > > > > +
> > > > > +		token = strtok(NULL, ", ");
> > > > > +	}
> > > > > +
> > > > > +	free(input);
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +parse_lcore_dma(struct test_configure *test_case, const char
> *value)
> > {
> > > > > +	struct lcore_dma_map_t *lcore_dma_map;
> > > > > +	char *input = strndup(value, strlen(value) + 1);
> > > > > +	char *addrs = input;
> > > > > +	char *ptrs[2];
> > > > > +	char *start, *end, *substr;
> > > > > +	uint16_t lcore_id;
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	while (*addrs == '\0')
> > > > > +		addrs++;
> > > > > +	if (*addrs == '\0') {
> > > > > +		fprintf(stderr, "No input DMA addresses\n");
> > > > > +		ret = -1;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	substr = strtok(addrs, ",");
> > > > > +	if (substr == NULL) {
> > > > > +		fprintf(stderr, "No input DMA address\n");
> > > > > +		ret = -1;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > > > > lcore_dma_map_t));
> > > > > +
> > > > > +	do {
> > > > > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > > > > +
> > > > > +		start = strstr(ptrs[0], "lcore");
> > > > > +		if (start == NULL) {
> > > > > +			fprintf(stderr, "Illegal lcore\n");
> > > > > +			ret = -1;
> > > > > +			break;
> > > > > +		}
> > > > > +
> > > > > +		start += 5;
> > > > > +		lcore_id = strtol(start, &end, 0);
> > > > > +		if (end == start) {
> > > > > +			fprintf(stderr, "No input lcore ID or ID %d is
> > > > > wrong\n", lcore_id);
> > > > > +			ret = -1;
> > > > > +			break;
> > > > > +		}
> > > > > +
> > > > > +		lcore_dma_map = &test_case->lcore_dma_map;
> > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt] =
> lcore_id;
> > > > > +		strcpy(lcore_dma_map-
> >dma_names[lcore_dma_map-
> > > > > >cnt], ptrs[1]);
> > > > > +		lcore_dma_map->cnt++;
> > > > > +		substr = strtok(NULL, ",");
> > > > > +	} while (substr != NULL);
> > > > > +
> > > > > +out:
> > > > > +	free(input);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +parse_entry(const char *value, struct test_configure_entry *entry)
> {
> > > > > +	char input[255] = {0};
> > > > > +	char *args[MAX_PARAMS_PER_ENTRY];
> > > > > +	int args_nr = -1;
> > > > > +
> > > > > +	if (value == NULL || entry == NULL)
> > > > > +		goto out;
> > > > > +
> > > > > +	strncpy(input, value, 254);
> > > > > +	if (*input == '\0')
> > > > > +		goto out;
> > > > > +
> > > > > +	args_nr = rte_strsplit(input, strlen(input), args,
> > > > > MAX_PARAMS_PER_ENTRY, ',');
> > > > > +	if (args_nr != 1 && args_nr != 4)
> > > > > +		goto out;
> > > > > +
> > > > > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > > > > +
> > > > > +	if (args_nr == 4) {
> > > > > +		entry->last = (uint32_t)atoi(args[1]);
> > > > > +		entry->incr = (uint32_t)atoi(args[2]);
> > > > > +		if (!strcmp(args[3], "MUL"))
> > > > > +			entry->op = OP_MUL;
> > > > > +		else if (!strcmp(args[3], "ADD"))
> > > > > +			entry->op = OP_ADD;
> > > > > +		else {
> > > > > +			printf("Invalid op %s.\n", args[3]);
> > > > > +			args_nr = -1;
> > > > > +		}
> > > > > +	} else {
> > > > > +		entry->op = OP_NONE;
> > > > > +		entry->last = 0;
> > > > > +		entry->incr = 0;
> > > > > +	}
> > > > > +out:
> > > > > +	return args_nr;
> > > > > +}
> > > > > +
> > > > > +static uint16_t
> > > > > +load_configs(const char *path)
> > > > > +{
> > > > > +	struct rte_cfgfile *cfgfile;
> > > > > +	int nb_sections, i;
> > > > > +	struct test_configure *test_case;
> > > > > +	char section_name[CFG_NAME_LEN];
> > > > > +	const char *case_type;
> > > > > +	const char *lcore_dma;
> > > > > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > > > > *kick_batch_str;
> > > > > +	int args_nr, nb_vp;
> > > > > +	bool is_dma;
> > > > > +
> > > > > +	printf("config file parsing...\n");
> > > > > +	cfgfile = rte_cfgfile_load(path, 0);
> > > > > +	if (!cfgfile) {
> > > > > +		printf("Open configure file error.\n");
> > > > > +		exit(1);
> > > > > +	}
> > > > > +
> > > > > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > > > > +	if (nb_sections > MAX_TEST_CASES) {
> > > > > +		printf("Error: The maximum number of cases is
> %d.\n",
> > > > > MAX_TEST_CASES);
> > > > > +		exit(1);
> > > > > +	}
> > > > > +
> > > > > +	for (i = 0; i < nb_sections; i++) {
> > > > > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i
> + 1);
> > > > > +		test_case = &test_cases[i];
> > > > > +		case_type = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > > > > "type");
> > > > > +		if (!case_type) {
> > > > > +			printf("Error: No case type in case %d, the
> test will be
> > > > > finished here.\n",
> > > > > +				i + 1);
> > > > > +			test_case->is_valid = false;
> > > > > +			continue;
> > > > > +		}
> > > > > +
> > > > > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > > > > +			test_case->test_type =
> > > > > TEST_TYPE_DMA_MEM_COPY;
> > > > > +			test_case->test_type_str =
> DMA_MEM_COPY;
> > > > > +			is_dma = true;
> > > > > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > > > > +			test_case->test_type =
> > > > > TEST_TYPE_CPU_MEM_COPY;
> > > > > +			test_case->test_type_str =
> CPU_MEM_COPY;
> > > > > +			is_dma = false;
> > > > > +		} else {
> > > > > +			printf("Error: Cannot find case type %s in
> case%d.\n",
> > > > > case_type, i + 1);
> > > > > +			test_case->is_valid = false;
> > > > > +			continue;
> > > > > +		}
> > > > > +
> > > > > +		nb_vp = 0;
> > > > > +
> > > > > +		test_case->src_numa_node =
> > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > +
> > > > > 	section_name, "src_numa_node"));
> > > > > +		test_case->dst_numa_node =
> > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > +
> > > > > 	section_name, "dst_numa_node"));
> > > > > +
> > > > > +		mem_size_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > > > > "mem_size");
> > > > > +		args_nr = parse_entry(mem_size_str, &test_case-
> > > > > >mem_size);
> > > > > +		if (args_nr < 0) {
> > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > +			test_case->is_valid = false;
> > > > > +			continue;
> > > > > +		} else if (args_nr > 1)
> > > > > +			nb_vp++;
> > > > > +
> > > > > +		buf_size_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > > > > "buf_size");
> > > > > +		args_nr = parse_entry(buf_size_str, &test_case-
> >buf_size);
> > > > > +		if (args_nr < 0) {
> > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > +			test_case->is_valid = false;
> > > > > +			continue;
> > > > > +		} else if (args_nr > 1)
> > > > > +			nb_vp++;
> > > > > +
> > > > > +		if (is_dma) {
> > > > > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > > section_name,
> > > > > +
> > > > > 	"dma_ring_size");
> > > > > +			args_nr = parse_entry(ring_size_str,
> &test_case-
> > > > > >ring_size);
> > > > > +			if (args_nr < 0) {
> > > > > +				printf("parse error in case %d.\n", i +
> 1);
> > > > > +				test_case->is_valid = false;
> > > > > +				continue;
> > > > > +			} else if (args_nr > 1)
> > > > > +				nb_vp++;
> > > > > +
> > > > > +			kick_batch_str =
> rte_cfgfile_get_entry(cfgfile,
> > > > > section_name, "kick_batch");
> > > > > +			args_nr = parse_entry(kick_batch_str,
> &test_case-
> > > > > >kick_batch);
> > > > > +			if (args_nr < 0) {
> > > > > +				printf("parse error in case %d.\n", i +
> 1);
> > > > > +				test_case->is_valid = false;
> > > > > +				continue;
> > > > > +			} else if (args_nr > 1)
> > > > > +				nb_vp++;
> > > > > +
> > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > section_name, "lcore_dma");
> > > > > +			int lcore_ret = parse_lcore_dma(test_case,
> > > > > lcore_dma);
> > > > > +			if (lcore_ret < 0) {
> > > > > +				printf("parse lcore dma error in case
> %d.\n", i
> > > > 1);
> > > > > +				test_case->is_valid = false;
> > > > > +				continue;
> > > > > +			}
> > > > > +		} else {
> > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > section_name, "lcore");
> > > > > +			int lcore_ret = parse_lcore(test_case,
> lcore_dma);
> > > > > +			if (lcore_ret < 0) {
> > > > > +				printf("parse lcore error in case
> %d.\n", i + 1);
> > > > > +				test_case->is_valid = false;
> > > > > +				continue;
> > > > > +			}
> > > > > +		}
> > > > > +
> > > > > +		if (nb_vp > 1) {
> > > > > +			printf("Error, each section can only have a
> single
> > > > > variable parameter.\n");
> > > > > +			test_case->is_valid = false;
> > > > > +			continue;
> > > > > +		}
> > > > > +
> > > > > +		test_case->cache_flush =
> > > > > +			(int)atoi(rte_cfgfile_get_entry(cfgfile,
> section_name,
> > > > > "cache_flush"));
> > > > > +		test_case->test_secs =
> > > > > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > +					section_name,
> "test_seconds"));
> > > > > +
> > > > > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > > > > section_name, "eal_args");
> > > > > +		test_case->is_valid = true;
> > > > > +	}
> > > > > +
> > > > > +	rte_cfgfile_close(cfgfile);
> > > > > +	printf("config file parsing complete.\n\n");
> > > > > +	return i;
> > > > > +}
> > > > > +
> > > > > +/* Parse the argument given in the command line of the
> > > > > +application */ static int append_eal_args(int argc, char **argv,
> > > > > +const char *eal_args, char **new_argv) {
> > > > > +	int i;
> > > > > +	char *tokens[MAX_EAL_PARAM_NB];
> > > > > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > > > > +	int token_nb, new_argc = 0;
> > > > > +
> > > > > +	for (i = 0; i < argc; i++) {
> > > > > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > > > > +				(strcmp(argv[i],
> CMDLINE_RESULT_ARG) ==
> > > > > 0)) {
> > > > > +			i++;
> > > > > +			continue;
> > > > > +		}
> > > > > +		strlcpy(new_argv[new_argc], argv[i],
> > > > > sizeof(new_argv[new_argc]));
> > > > > +		new_argc++;
> > > > > +	}
> > > > > +
> > > > > +	if (eal_args) {
> > > > > +		strlcpy(args, eal_args, sizeof(args));
> > > > > +		token_nb = rte_strsplit(args, strlen(args),
> > > > > +					tokens,
> MAX_EAL_PARAM_NB, ' ');
> > > > > +		for (i = 0; i < token_nb; i++)
> > > > > +			strcpy(new_argv[new_argc++], tokens[i]);
> > > > > +	}
> > > > > +
> > > > > +	return new_argc;
> > > > > +}
> > > > > +
> > > > > +int
> > > > > +main(int argc, char *argv[])
> > > > > +{
> > > > > +	int ret;
> > > > > +	uint16_t case_nb;
> > > > > +	uint32_t i, nb_lcores;
> > > > > +	pid_t cpid, wpid;
> > > > > +	int wstatus;
> > > > > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > > > > +	char *pargs[MAX_EAL_PARAM_NB];
> > > > > +	char *cfg_path_ptr = NULL;
> > > > > +	char *rst_path_ptr = NULL;
> > > > > +	char rst_path[PATH_MAX];
> > > > > +	int new_argc;
> > > > > +	bool is_first_case = true;
> > > > > +
> > > > > +	memset(args, 0, sizeof(args));
> > > > > +
> > > > > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > > > > +		pargs[i] = args[i];
> > > > > +
> > > > > +	for (i = 0; i < (uint32_t)argc; i++) {
> > > > > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > +			cfg_path_ptr = argv[i + 1];
> > > > > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > +			rst_path_ptr = argv[i + 1];
> > > > > +	}
> > > > > +	if (cfg_path_ptr == NULL) {
> > > > > +		printf("Config file not assigned.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +	if (rst_path_ptr == NULL) {
> > > > > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > > > > +		strcat(strtok(basename(rst_path), "."),
> "_result.csv");
> > > > > +		rst_path_ptr = rst_path;
> > > > > +	}
> > > > > +
> > > > > +	case_nb = load_configs(cfg_path_ptr);
> > > > > +	fd = fopen(rst_path_ptr, "w");
> > > > > +	if (fd == NULL) {
> > > > > +		printf("Open output CSV file error.\n");
> > > > > +		return -1;
> > > > > +	}
> > > > > +	fclose(fd);
> > > > > +
> > > > > +	for (i = 0; i < case_nb; i++) {
> > > > > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > > > > +			printf("No test type in test case %d.\n\n", i +
> 1);
> > > > > +			continue;
> > > > > +		}
> > > > > +		if (!test_cases[i].is_valid) {
> > > > > +			printf("Invalid test case %d.\n\n", i + 1);
> > > > > +			continue;
> > > > > +		}
> > > > > +
> > > > > +		cpid = fork();
> > > >
> > > > [Anoob] Do we really need fork()? Can't we use code like,
> > > >
> > > > 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> > > > 			ret |= rte_eal_wait_lcore(lcore_id);
> > > > 		}
> > > >
> > > > to wait for all threads to exit?
> > >
> > > [Cheng] Good question. Fork() is used here to establish a new process
> > > for the new test case. In order for each test case to have a new EAL
> > > environment (for flexibility), the EAL must be reinitialized for each
> > > case. However, the EAL parameters can only be initialized once per
> > > process. Therefore, we use a new process to run each new test case.
> > > Moreover, each test case runs sequentially and does not affect the
> > > others, ensuring the accuracy of the performance data. Your code would
> > > wait for all threads to exit in the same process. However, it would not
> > > provide a "clean" environment for each test case like fork() does.
> > > Fork() allows us to have a fully reinitialized environment, with no
> > > impact or side effects from previous test cases. This results in
> > > clean, precise performance data for each case.
> > >
> > > Please let me know your thoughts on this. And please let me know if
> > > you have any other questions or require any clarification.
> >
> > [Anoob] This was just a generic observation. I do not have a strong opinion
> > either way.
> >
> 
> [Cheng] sure, got it.
> 
> > >
> > > Thanks,
> > > Cheng
> > >
> > > >
> > > > > +		if (cpid < 0) {
> > > > > +			printf("Fork case %d failed.\n", i + 1);
> > > > > +			exit(EXIT_FAILURE);
> > > > > +		} else if (cpid == 0) {
> > > > > +			printf("\nRunning case %u\n\n", i + 1);
> > > > > +
> > > > > +			new_argc = append_eal_args(argc, argv,
> > > > > test_cases[i].eal_args, pargs);
> > > > > +			ret = rte_eal_init(new_argc, pargs);
> > > > > +			if (ret < 0)
> > > > > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > > > > arguments\n");
> > > > > +
> > > > > +			/* Check lcores. */
> > > > > +			nb_lcores = rte_lcore_count();
> > > > > +			if (nb_lcores < 2)
> > > > > +				rte_exit(EXIT_FAILURE,
> > > > > +					"There should be at least 2
> worker
> > > > > lcores.\n");
> > > > > +
> > > > > +			fd = fopen(rst_path_ptr, "a");
> > > > > +			if (!fd) {
> > > > > +				printf("Open output CSV file
> error.\n");
> > > > > +				return 0;
> > > > > +			}
> > > > > +
> > > > > +			if (is_first_case) {
> > > > > +				output_env_info();
> > > > > +				is_first_case = false;
> > > > > +			}
> > > > > +			run_test(i + 1, &test_cases[i]);
> > > > > +
> > > > > +			/* clean up the EAL */
> > > > > +			rte_eal_cleanup();
> > > > > +
> > > > > +			fclose(fd);
> > > > > +
> > > > > +			printf("\nCase %u completed.\n\n", i + 1);
> > > > > +
> > > > > +			exit(EXIT_SUCCESS);
> > > > > +		} else {
> > > > > +			wpid = waitpid(cpid, &wstatus, 0);
> > > > > +			if (wpid == -1) {
> > > > > +				printf("waitpid error.\n");
> > > > > +				exit(EXIT_FAILURE);
> > > > > +			}
> > > > > +
> > > > > +			if (WIFEXITED(wstatus))
> > > > > +				printf("Case process exited. status
> %d\n\n",
> > > > > +					WEXITSTATUS(wstatus));
> > > > > +			else if (WIFSIGNALED(wstatus))
> > > > > +				printf("Case process killed by signal
> %d\n\n",
> > > > > +					WTERMSIG(wstatus));
> > > > > +			else if (WIFSTOPPED(wstatus))
> > > > > +				printf("Case process stopped by
> signal
> > > > > %d\n\n",
> > > > > +					WSTOPSIG(wstatus));
> > > > > +			else if (WIFCONTINUED(wstatus))
> > > > > +				printf("Case process
> continued.\n\n");
> > > > > +			else
> > > > > +				printf("Case process unknown
> > > > > terminated.\n\n");
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	printf("Bye...\n");
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> > > > > new file mode 100644 index 0000000000..215ac42673
> > > > > --- /dev/null
> > > > > +++ b/app/test-dma-perf/main.h
> > > > > @@ -0,0 +1,69 @@
> > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > +
> > > > > +#ifndef _MAIN_H_
> > > > > +#define _MAIN_H_
> > > > > +
> > > > > +
> > > > > +#include <rte_common.h>
> > > > > +#include <rte_cycles.h>
> > > > > +#include <rte_dev.h>
> > > > > +#include <rte_dmadev.h>
> > > > > +
> > > > > +#ifndef __maybe_unused
> > > > > +#define __maybe_unused	__rte_unused
> > > > > +#endif
> > > > > +
> > > > > +#define MAX_WORKER_NB 128
> > > > > +#define MAX_OUTPUT_STR_LEN 512
> > > > > +
> > > > > +#define MAX_DMA_NB 128
> > > > > +#define MAX_LCORE_NB 256
> > > > > +
> > > > > +extern char
> output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > +
> > > > > +typedef enum {
> > > > > +	OP_NONE = 0,
> > > > > +	OP_ADD,
> > > > > +	OP_MUL
> > > > > +} alg_op_type;
> > > > > +
> > > > > +struct test_configure_entry {
> > > > > +	uint32_t first;
> > > > > +	uint32_t last;
> > > > > +	uint32_t incr;
> > > > > +	alg_op_type op;
> > > > > +	uint32_t cur;
> > > > > +};
> > > > > +
> > > > > +struct lcore_dma_map_t {
> > > > > +	uint32_t lcores[MAX_WORKER_NB];
> > > > > +	char
> dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > > > > +	int16_t dma_ids[MAX_WORKER_NB];
> > > > > +	uint16_t cnt;
> > > > > +};
> > > > > +
> > > > > +struct test_configure {
> > > > > +	bool is_valid;
> > > > > +	uint8_t test_type;
> > > > > +	const char *test_type_str;
> > > > > +	uint16_t src_numa_node;
> > > > > +	uint16_t dst_numa_node;
> > > > > +	uint16_t opcode;
> > > > > +	bool is_dma;
> > > > > +	struct lcore_dma_map_t lcore_dma_map;
> > > > > +	struct test_configure_entry mem_size;
> > > > > +	struct test_configure_entry buf_size;
> > > > > +	struct test_configure_entry ring_size;
> > > > > +	struct test_configure_entry kick_batch;
> > > > > +	uint32_t cache_flush;
> > > > > +	uint32_t nr_buf;
> > > > > +	uint16_t test_secs;
> > > > > +	const char *eal_args;
> > > > > +	uint8_t scenario_id;
> > > > > +};
> > > > > +
> > > > > +void mem_copy_benchmark(struct test_configure *cfg, bool
> is_dma);
> > > > > +
> > > > > +#endif /* _MAIN_H_ */
> > > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > > perf/meson.build new file mode 100644 index
> 0000000000..bd6c264002
> > > > > --- /dev/null
> > > > > +++ b/app/test-dma-perf/meson.build
> > > > > @@ -0,0 +1,17 @@
> > > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > > > +Intel Corporation
> > > > > +
> > > > > +# meson file, for building this app as part of a main DPDK build.
> > > > > +
> > > > > +if is_windows
> > > > > +    build = false
> > > > > +    reason = 'not supported on Windows'
> > > > > +    subdir_done()
> > > > > +endif
> > > > > +
> > > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > > +
> > > > > +sources = files(
> > > > > +        'main.c',
> > > > > +        'benchmark.c',
> > > > > +)
> > > > > --
> > > > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-15 15:47           ` Anoob Joseph
@ 2023-06-16  2:56             ` Jiang, Cheng1
  2023-06-16  6:32               ` Anoob Joseph
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-16  2:56 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Thursday, June 15, 2023 11:48 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Please see inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Thursday, June 15, 2023 7:36 PM
> > To: Anoob Joseph <anoobj@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> application
> >
> > Hi Anoob,
> >
> > Replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Thursday, June 15, 2023 4:45 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> > >
> > > Hi Cheng,
> > >
> > > Please see inline.
> > >
> > > Thanks,
> > > Anoob
> > >
> > > > -----Original Message-----
> > > > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > Sent: Thursday, June 15, 2023 1:31 PM
> > > > To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> > Amit
> > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang,
> > > YuanX
> > > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > Hi,
> > > >
> > > > Thanks for your comments, the replies are inline.
> > > >
> > > > Thanks,
> > > > Cheng
> > > >
> > > > > -----Original Message-----
> > > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > > Sent: Thursday, June 15, 2023 1:22 PM
> > > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> > > Amit
> > > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang,
> > > > YuanX
> > > > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > application
> > > > >
> > > > > Hi,
> > > > >
> > > > > Thanks for working on the comments. Few more top level comment
> > > inline.
> > > > >
> > > > > Thanks,
> > > > > Anoob
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > Sent: Tuesday, June 13, 2023 10:02 AM
> > > > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash
> > > > Shukla
> > > > > > <amitprakashs@marvell.com>; Anoob Joseph
> > <anoobj@marvell.com>
> > > > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > > > xingguang.he@intel.com;
> > > > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > > application
> > > > > >
> > > > > > External Email
> > > > > >
> > > > > > ------------------------------------------------------------------
> > > > > > --
> > > > > > -- There are many high-performance DMA devices supported in
> DPDK
> > > > > > now, and these DMA devices can also be integrated into other
> > > > > > modules of DPDK as accelerators, such as Vhost. Before integrating
> > > > > > DMA into applications, developers need to know the performance
> of
> > > > > > these DMA devices in various scenarios and the performance of
> CPUs
> > > > > > in the same scenario, such as different buffer lengths. Only in
> > > > > > this way can we know the target performance of the application
> > > > > > accelerated by using them. This patch introduces a
> > > > > > high-performance testing tool, which supports comparing the
> > > > > > performance of CPU and DMA in different scenarios automatically
> > > > > > with a pre- set config file. Memory Copy performance test are
> > > > > supported for now.
> > > > > >
> > > > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > > > ---
> > > > > > v6:
> > > > > >   improved code based on Anoob's comments;
> > > > > >   fixed some code structure issues;
> > > > > > v5:
> > > > > >   fixed some LONG_LINE warnings;
> > > > > > v4:
> > > > > >   fixed inaccuracy of the memory footprint display;
> > > > > > v3:
> > > > > >   fixed some typos;
> > > > > > v2:
> > > > > >   added lcore/dmadev designation;
> > > > > >   added error case process;
> > > > > >   removed worker_threads parameter from config.ini;
> > > > > >   improved the logs;
> > > > > >   improved config file;
> > > > > >
> > > > > >  app/meson.build               |   1 +
> > > > > >  app/test-dma-perf/benchmark.c | 477
> > > > ++++++++++++++++++++++++++++
> > > > > > app/test-dma-perf/config.ini  |  59 ++++
> > > > > >  app/test-dma-perf/main.c      | 569
> > > > > > ++++++++++++++++++++++++++++++++++
> > > > > >  app/test-dma-perf/main.h      |  69 +++++
> > > > > >  app/test-dma-perf/meson.build |  17 +
> > > > > >  6 files changed, 1192 insertions(+)  create mode 100644
> > > > > > app/test-dma-perf/benchmark.c  create mode 100644
> > > > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > > > > mode
> > > > > > 100644 app/test-dma-perf/meson.build
> > > > > >
> > > > > > diff --git a/app/meson.build b/app/meson.build index
> > > > > > 74d2420f67..4fc1a83eba 100644
> > > > > > --- a/app/meson.build
> > > > > > +++ b/app/meson.build
> > > > > > @@ -19,6 +19,7 @@ apps = [
> > > > > >          'test-cmdline',
> > > > > >          'test-compress-perf',
> > > > > >          'test-crypto-perf',
> > > > > > +        'test-dma-perf',
> > > > > >          'test-eventdev',
> > > > > >          'test-fib',
> > > > > >          'test-flow-perf',
> > > > > > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > > > > > perf/benchmark.c new file mode 100644 index
> > 0000000000..bc1ca82297
> > > > > > --- /dev/null
> > > > > > +++ b/app/test-dma-perf/benchmark.c
> > > > > > @@ -0,0 +1,477 @@
> > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > +
> > > > > > +#include <inttypes.h>
> > > > > > +#include <stdio.h>
> > > > > > +#include <stdlib.h>
> > > > > > +#include <unistd.h>
> > > > > > +
> > > > > > +#include <rte_time.h>
> > > > > > +#include <rte_mbuf.h>
> > > > > > +#include <rte_dmadev.h>
> > > > > > +#include <rte_malloc.h>
> > > > > > +#include <rte_lcore.h>
> > > > > > +
> > > > > > +#include "main.h"
> > > > > > +
> > > > > > +#define MAX_DMA_CPL_NB 255
> > > > > > +
> > > > > > +#define TEST_WAIT_U_SECOND 10000
> > > > > > +
> > > > > > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%"
> > > > PRIu64
> > > > > > ",%.3lf,%.3lf\n"
> > > > > > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%"
> > > > PRIu64
> > > > > > ",%.3lf,%.3lf\n"
> > > > > > +
> > > > > > +struct worker_info {
> > > > > > +	bool ready_flag;
> > > > > > +	bool start_flag;
> > > > > > +	bool stop_flag;
> > > > > > +	uint32_t total_cpl;
> > > > > > +	uint32_t test_cpl;
> > > > > > +};
> > > > > > +
> > > > > > +struct lcore_params {
> > > > > > +	uint8_t scenario_id;
> > > > > > +	unsigned int lcore_id;
> > > > > > +	char *dma_name;
> > > > > > +	uint16_t worker_id;
> > > > > > +	uint16_t dev_id;
> > > > > > +	uint32_t nr_buf;
> > > > > > +	uint16_t kick_batch;
> > > > > > +	uint32_t buf_size;
> > > > > > +	uint16_t test_secs;
> > > > > > +	struct rte_mbuf **srcs;
> > > > > > +	struct rte_mbuf **dsts;
> > > > > > +	struct worker_info worker_info;
> > > > > > +};
> > > > > > +
> > > > > > +static struct rte_mempool *src_pool; static struct rte_mempool
> > > > > > +*dst_pool;
> > > > > > +
> > > > > > +static volatile struct lcore_params
> > > > *worker_params[MAX_WORKER_NB];
> > > > > > +
> > > > > > +#define PRINT_ERR(...) print_err(__func__, __LINE__,
> > __VA_ARGS__)
> > > > > > +
> > > > > > +static inline int
> > > > > > +__rte_format_printf(3, 4)
> > > > > > +print_err(const char *func, int lineno, const char *format, ...) {
> > > > > > +	va_list ap;
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > > > > > +	va_start(ap, format);
> > > > > > +	ret += vfprintf(stderr, format, ap);
> > > > > > +	va_end(ap);
> > > > > > +
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static inline void
> > > > > > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t
> > > > > > +nb_workers,
> > > > > > uint16_t test_secs,
> > > > > > +				uint32_t total_cnt, float *memory,
> > uint32_t
> > > > > > *ave_cycle,
> > > > > > +				float *bandwidth, float *mops) {
> > > > > > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) /
> > (1024
> > > > > > +*
> > > > > > 1024);
> > > > > > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > > > > > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > > > > > (float)*ave_cycle)) / 1000000000;
> 
> [Anoob] The above calculation may not yield accurate results. 'ave_cycle' is
> truncated to an integer, so the reported bandwidth can take only a few
> distinct values. Instead, we can do the calculation directly like,
> 
> 	*bandwidth = ((float)buf_size * 8 * total_cnt / test_secs) /
> 1000000000;
> 	*mops = (float)total_cnt / test_secs / 1000000;
> 
> Same issue is there with below calculation as well. Please check.

[Cheng] Yes, I've noticed that as well. Dengdui also mentioned this in his comments. I will address this issue in v7. Thank you very much.

> 
> Side note: in bandwidth calculation, shouldn't we be dividing by
> 1024*1024*1024? I've just carried the calculation that you used. Feel free to
> correct as required.

[Cheng] The unit I'm using in my calculations is Gb/s (Gigabits per second), which is based on the decimal system. Therefore, I use the factor of 1000^3 (or 1,000,000,000).
The method you mentioned, dividing by 1024^3, is typically used when calculating Gibit/s (gibibits per second), a binary-based unit.
I think both methods are acceptable as long as the units and calculation methods correspond.
What do you think?

> 
> > > > > > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > > > > > +
> > > > > > +static void
> > > > > > +output_result(uint8_t scenario_id, uint32_t lcore_id, char
> > > > > > +*dma_name,
> > > > > > uint64_t ave_cycle,
> > > > > > +			uint32_t buf_size, uint32_t nr_buf, float
> > memory,
> > > > > > +			float bandwidth, float mops, bool is_dma) {
> > > > > > +	if (is_dma)
> > > > > > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > > > > > +	else
> > > > > > +		printf("lcore %u\n", lcore_id);
> > > > > > +
> > > > > > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u,
> > nr_buf:
> > > > > > +%u,
> > > > > > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > > > > > +			ave_cycle, buf_size, nr_buf, memory,
> > > > > > rte_get_timer_hz());
> > > > > > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n",
> > bandwidth,
> > > > > > +mops);
> > > > > > +
> > > > > > +	if (is_dma)
> > > > > > +		snprintf(output_str[lcore_id],
> > MAX_OUTPUT_STR_LEN,
> > > > > > CSV_LINE_DMA_FMT,
> > > > > > +			scenario_id, lcore_id, dma_name, buf_size,
> > > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> > mops);
> > > > > > +	else
> > > > > > +		snprintf(output_str[lcore_id],
> > MAX_OUTPUT_STR_LEN,
> > > > > > CSV_LINE_CPU_FMT,
> > > > > > +			scenario_id, lcore_id, buf_size,
> > > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> > mops); }
> > > > > > +
> > > > > > +static inline void
> > > > > > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > > > > > +		__maybe_unused uint32_t buf_size,
> > > > > > +		__maybe_unused uint32_t nr_buf) { #ifdef
> > > RTE_ARCH_X86_64
> > > > > > +	char *data;
> > > > > > +	struct rte_mbuf **srcs = array;
> > > > > > +	uint32_t i, offset;
> > > > > > +
> > > > > > +	for (i = 0; i < nr_buf; i++) {
> > > > > > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > > > > > +		for (offset = 0; offset < buf_size; offset += 64)
> > > > > > +			__builtin_ia32_clflush(data + offset);
> > > > > > +	}
> > > > > > +#endif
> > > > > > +}
> > > > > > +
> > > > > > +/* Configuration of device. */
> > > > > > +static void
> > > > > > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > > > > > +	uint16_t vchan = 0;
> > > > > > +	struct rte_dma_info info;
> > > > > > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > > > > +	struct rte_dma_vchan_conf qconf = {
> > > > > > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > > > > +		.nb_desc = ring_size
> > > > > > +	};
> > > > > > +
> > > > > > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma
> > configure.\n");
> > > > > > +
> > > > > > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > > > > > +		rte_exit(EXIT_FAILURE, "Error with queue
> > configuration.\n");
> > > > > > +
> > > > > > +	rte_dma_info_get(dev_id, &info);
> > > > > > +	if (info.nb_vchans != 1)
> > > > > > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > > > > > reported on device id. %u\n",
> > > > > > +				dev_id);
> > > > > > +
> > > > > > +	if (rte_dma_start(dev_id) != 0)
> > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > > > > > +
> > > > > > +static int
> > > > > > +config_dmadevs(struct test_configure *cfg) {
> > > > > > +	uint32_t ring_size = cfg->ring_size.cur;
> > > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > > +	uint32_t nb_workers = ldm->cnt;
> > > > > > +	uint32_t i;
> > > > > > +	int dev_id;
> > > > > > +	uint16_t nb_dmadevs = 0;
> > > > > > +	char *dma_name;
> > > > > > +
> > > > > > +	for (i = 0; i < ldm->cnt; i++) {
> > > > > > +		dma_name = ldm->dma_names[i];
> > > > > > +		dev_id =
> > rte_dma_get_dev_id_by_name(dma_name);
> > > > > > +		if (dev_id == -1) {
> > > > > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > > > > dma_name);
> > > > > > +			goto end;
> > > > > > +		}
> > > > > > +
> > > > > > +		ldm->dma_ids[i] = dev_id;
> > > > > > +		configure_dmadev_queue(dev_id, ring_size);
> > > > > > +		++nb_dmadevs;
> > > > > > +	}
> > > > > > +
> > > > > > +end:
> > > > > > +	if (nb_dmadevs < nb_workers) {
> > > > > > +		printf("Not enough dmadevs (%u) for all workers
> > (%u).\n",
> > > > > > nb_dmadevs, nb_workers);
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +#define POLL_MAX 1000
> > > > > > +
> > > > > > +
> > > > >
> > > > > [Anoob] Extra blank line. You can consider removing.
> > > >
> > > > [Cheng] sure, sorry for the miss.
> > > >
> > > > >
> > > > > > +static inline void
> > > > > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > > > > > +			volatile struct worker_info *worker_info) {
> > > > > > +	int ret;
> > > > > > +	uint16_t nr_cpl;
> > > > > > +
> > > > > > +	ret = rte_dma_submit(dev_id, 0);
> > > > > > +	if (ret < 0) {
> > > > > > +		rte_dma_stop(dev_id);
> > > > > > +		rte_dma_close(dev_id);
> > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > > > > +	}
> > > > > > +
> > > > > > +	nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL,
> > > > > > NULL);
> > > > > > +	*async_cnt -= nr_cpl;
> > > > > > +	worker_info->total_cpl += nr_cpl; }
> > > > > > +
> > > > > > +static inline int
> > > > > > +do_dma_mem_copy(void *p)
> > > > > > +{
> > > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > > +	volatile struct lcore_params *para =
> > worker_params[*para_idx];
> > > > > > +	volatile struct worker_info *worker_info = &(para-
> > >worker_info);
> > > > > > +	const uint16_t dev_id = para->dev_id;
> > > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > > +	const uint16_t kick_batch = para->kick_batch;
> > > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > > +	uint16_t nr_cpl;
> > > > > > +	uint64_t async_cnt = 0;
> > > > > > +	uint32_t i;
> > > > > > +	uint32_t poll_cnt = 0;
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	worker_info->stop_flag = false;
> > > > > > +	worker_info->ready_flag = true;
> > > > > > +
> > > > > > +	while (!worker_info->start_flag)
> > > > > > +		;
> > > > > > +
> > > > > > +	while (1) {
> > > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > > +dma_copy:
> > > > > > +			ret = rte_dma_copy(dev_id, 0,
> > > > > > rte_pktmbuf_iova(srcs[i]),
> > > > > > +				rte_pktmbuf_iova(dsts[i]), buf_size,
> > 0);
> > > > > > +			if (unlikely(ret < 0)) {
> > > > > > +				if (ret == -ENOSPC) {
> > > > > > +
> > 	do_dma_submit_and_poll(dev_id,
> > > > > > &async_cnt, worker_info);
> > > > > > +					goto dma_copy;
> > > > > > +				} else {
> > > > > > +					/* Error exit */
> > > > > > +					rte_dma_stop(dev_id);
> > > > > > +					rte_exit(EXIT_FAILURE,
> > "DMA
> > > > > > enqueue failed\n");
> > > > > > +				}
> > > > > > +			}
> > > > > > +			async_cnt++;
> > > > > > +
> > > > > > +			if ((async_cnt % kick_batch) == 0)
> > > > > > +				do_dma_submit_and_poll(dev_id,
> > > > > > &async_cnt, worker_info);
> > > > > > +		}
> > > > > > +
> > > > > > +		if (worker_info->stop_flag)
> > > > > > +			break;
> > > > > > +	}
> > > > > > +
> > > > > > +	rte_dma_submit(dev_id, 0);
> > > > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > > > +		async_cnt -= nr_cpl;
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static inline int
> > > > > > +do_cpu_mem_copy(void *p)
> > > > > > +{
> > > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > > +	volatile struct lcore_params *para =
> > worker_params[*para_idx];
> > > > > > +	volatile struct worker_info *worker_info = &(para-
> > >worker_info);
> > > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > > +	uint32_t i;
> > > > > > +
> > > > > > +	worker_info->stop_flag = false;
> > > > > > +	worker_info->ready_flag = true;
> > > > > > +
> > > > > > +	while (!worker_info->start_flag)
> > > > > > +		;
> > > > > > +
> > > > > > +	while (1) {
> > > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > > +			/* copy buffer form src to dst */
> > > > > > +			rte_memcpy((void
> > > > > > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > > > > > +				(void
> > > > > > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > > > > > +				(size_t)buf_size);
> > > > > > +			worker_info->total_cpl++;
> > > > > > +		}
> > > > > > +		if (worker_info->stop_flag)
> > > > > > +			break;
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int
> > > > > > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf
> > > > ***srcs,
> > > > > > +			struct rte_mbuf ***dsts)
> > > > > > +{
> > > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > > +	unsigned int nr_sockets;
> > > > > > +	uint32_t nr_buf = cfg->nr_buf;
> > > > > > +
> > > > > > +	nr_sockets = rte_socket_count();
> > > > > > +	if (cfg->src_numa_node >= nr_sockets ||
> > > > > > +		cfg->dst_numa_node >= nr_sockets) {
> > > > > > +		printf("Error: Source or destination numa exceeds
> > the acture
> > > > > > numa nodes.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	src_pool =
> > rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > > > > > +			nr_buf, /* n == num elements */
> > > > > > +			64,  /* cache size */
> > > > > > +			0,   /* priv size */
> > > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > > +			cfg->src_numa_node);
> > > > > > +	if (src_pool == NULL) {
> > > > > > +		PRINT_ERR("Error with source mempool
> > creation.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	dst_pool =
> > rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > > > > > +			nr_buf, /* n == num elements */
> > > > > > +			64,  /* cache size */
> > > > >
> > > > > [Anoob] We do not alloc or free pointers in the datapath, right? So
> > > > > why bother with cache?
> > > >
> > > > [Cheng] Yes, you are right, the cache size is not necessary here, I'll
> > > > fix it in the next version.
> > > >
> > > > >
> > > > > > +			0,   /* priv size */
> > > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > > +			cfg->dst_numa_node);
> > > > > > +	if (dst_pool == NULL) {
> > > > > > +		PRINT_ERR("Error with destination mempool
> > creation.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> > 0);
> > > > > > +	if (*srcs == NULL) {
> > > > > > +		printf("Error: srcs malloc failed.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > >
> > > > > [Anoob] Are we freeing these memory? The ones allocated with
> > > > rte_malloc.
> > > >
> > > > [Cheng] yes, we freed the memory in the end of
> > mem_copy_benchmark()
> > > > when we finished the test.
> > >
> > > [Anoob] I think we are not freeing this mem. In the place where we free
> all
> > > mem, we do free all objects to mempool as well as the mempools. But
> this
> > > memory is to hold the pointers, right? Is that getting freed anywhere?
> > >
> > > Also, in the mem clearing paths, do we need to clear the static variables
> (ie,
> > > set srcs, src_pool, dsts, dst_pool to NULL) so that there won't be any
> scope
> > > for any double free.
> > >
> >
> > [Cheng] My apologies for the misunderstanding earlier. I now understand
> > your point that you are right, the memory used to store the pointers is not
> > being freed. I will fix this issue in the next version. Regarding the static
> > variables you mentioned, I agree with your view that they should be
> cleared.
> > I will address this in the upcoming version as well. Thank you very much for
> > the feedback. It is greatly appreciated.
> >
> > In addition, I think we also need to nullify these variables when initializing
> > them to ensure safety and standardization of use. What do you think?
> 
> [Anoob] Since these are static variables, it is probably okay to skip the init
> part. But when we use it, we should clear it after use.
> 
> Please check above. I've posted one more comment. In case you missed.
> 

[Cheng] Sure, thanks for your advice. I'll clear them after use in the next version.

> >
> > Thanks!
> >
> > > >
> > > > >
> > > > > > +
> > > > > > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> > 0);
> > > > > > +	if (*dsts == NULL) {
> > > > > > +		printf("Error: dsts malloc failed.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf)
> > != 0) {
> > > > > > +		printf("get src mbufs failed.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf)
> > != 0) {
> > > > > > +		printf("get dst mbufs failed.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +void
> > > > > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > > > > +	uint16_t i;
> > > > > > +	uint32_t offset;
> > > > > > +	unsigned int lcore_id = 0;
> > > > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > > > +1024) /
> > > > > > (cfg->buf_size.cur * 2);
> > > > > > +	uint16_t nb_workers = ldm->cnt;
> > > > > > +	uint16_t test_secs = cfg->test_secs;
> > > > > > +	float memory;
> > > > > > +	uint32_t avg_cycles = 0;
> > > > > > +	float mops;
> > > > > > +	float bandwidth;
> > > > > > +
> > > > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	if (is_dma)
> > > > > > +		if (config_dmadevs(cfg) < 0)
> > > > > > +			goto out;
> > > > > > +
> > > > > > +	if (cfg->cache_flush) {
> > > > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > > > +		rte_mb();
> > > > > > +	}
> > > > > > +
> > > > > > +	printf("Start testing....\n");
> > > > > > +
> > > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > > +		lcore_id = ldm->lcores[i];
> > > > > > +		offset = nr_buf / nb_workers * i;
> > > > > > +
> > > > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > > > lcore_params), 0);
> > > > > > +		if (!worker_params[i]) {
> > > > > > +			printf("lcore parameters malloc failure for
> > lcore
> > > > > > %d\n", lcore_id);
> > > > > > +			break;
> > > > > > +		}
> > > > >
> > > > > [Anoob] Are we freeing the above memory?
> > > >
> > > > [Cheng] sorry, I missed that, I'll add worker_params memory free in
> > > > the next version, thanks.
> > > >
> > > > >
> > > > > > +		if (is_dma) {
> > > > > > +			worker_params[i]->dma_name = ldm-
> > > > > > >dma_names[i];
> > > > > > +			worker_params[i]->dev_id = ldm-
> > >dma_ids[i];
> > > > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > > > +		}
> > > > > > +		worker_params[i]->worker_id = i;
> > > > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > > > nb_workers);
> > > > > > +		worker_params[i]->buf_size = buf_size;
> > > > > > +		worker_params[i]->test_secs = test_secs;
> > > > > > +		worker_params[i]->srcs = srcs + offset;
> > > > > > +		worker_params[i]->dsts = dsts + offset;
> > > > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > > > +
> > > > > > +		if (is_dma)
> > > > > > +
> > 	rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > > > *)(&i), lcore_id);
> > > > > > +		else
> > > > > > +			rte_eal_remote_launch(do_cpu_mem_copy,
> > (void
> > > > > > *)(&i), lcore_id);
> > > > > > +	}
> > > > > > +
> > > > > > +	while (1) {
> > > > > > +		bool ready = true;
> > > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > > +			if (worker_params[i]-
> > >worker_info.ready_flag ==
> > > > > > false) {
> > > > > > +				ready = 0;
> > > > > > +				break;
> > > > > > +			}
> > > > > > +		}
> > > > > > +		if (ready)
> > > > > > +			break;
> > > > > > +	}
> > > > > > +
> > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > > > +
> > > > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > > +worker_params[i]->worker_info.total_cpl;
> > > > > > +
> > > > > > +	usleep(test_secs * 1000 * 1000);
> > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > > worker_params[i]->worker_info.total_cpl -
> > > > > > +						worker_params[i]-
> > > > > > >worker_info.test_cpl;
> > > > > > +
> > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > > > +
> > > > > > +	rte_eal_mp_wait_lcore();
> > > > > > +
> > > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > > > +			worker_params[i]->worker_info.test_cpl,
> > > > > > +			&memory, &avg_cycles, &bandwidth,
> > &mops);
> > > > > > +		output_result(cfg->scenario_id, worker_params[i]-
> > >lcore_id,
> > > > > > +					worker_params[i]-
> > >dma_name,
> > > > > > avg_cycles, buf_size,
> > > > > > +					nr_buf / nb_workers,
> > memory,
> > > > > > bandwidth, mops, is_dma);
> > > > > > +	}
> > > > > > +
> > > > > > +out:
> > > > > > +	/* free env */
> > > > > > +	if (srcs)
> > > > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > > > +	if (dsts)
> > > > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > > > +
> > > > > > +	if (src_pool)
> > > > > > +		rte_mempool_free(src_pool);
> > > > > > +	if (dst_pool)
> > > > > > +		rte_mempool_free(dst_pool);
> > > > > > +
> > > > > > +	if (is_dma) {
> > > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > > +			printf("Stopping dmadev %d\n", ldm-
> > >dma_ids[i]);
> > > > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > > > > > +		}
> > > > > > +	}
> > > > > > +}
> > > > > > diff --git a/app/test-dma-perf/config.ini
> > > > > > b/app/test-dma-perf/config.ini new file mode 100644 index
> > > > > > 0000000000..2fd9c3c387
> > > > > > --- /dev/null
> > > > > > +++ b/app/test-dma-perf/config.ini
> > > > > > @@ -0,0 +1,59 @@
> > > > > > +
> > > > > > +; This is an example configuration file for dma-perf, which
> > > > > > +details the meanings of each parameter ; and instructions on how
> > > > > > +to use dma-
> > > > perf.
> > > > > > +
> > > > > > +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
> > > > > > +
> > > > > > +; Parameters:
> > > > > > +; "mem_size" denotes the size of the memory footprint.
> > > > > > +; "buf_size" denotes the memory size of a single operation.
> > > > > > +; "dma_ring_size" denotes the dma ring buffer size. It should be
> > > > > > +greater
> > > > > > than 64 normally.
> > > > > > +; "kick_batch" denotes the dma operation batch size, and should
> > > > > > +be greater
> > > > > > than 1 normally.
> > > > > > +
> > > > > > +; The format for variables is variable=first,last,increment,ADD|MUL.
> > > > > > +
> > > > > > +; src_numa_node is used to control the numa node where the
> > source
> > > > > > memory is allocated.
> > > > > > +; dst_numa_node is used to control the numa node where the
> > > > > > +destination
> > > > > > memory is allocated.
> > > > > > +
> > > > > > +; cache_flush is used to determine whether or not the cache
> > > > > > +should be flushed, with 1 indicating to ; flush and 0 indicating to
> not
> > > flush.
> > > > > > +
> > > > > > +; test_seconds controls the test time of the whole case.
> > > > > > +
> > > > > > +; To use DMA for a test, please specify the "lcore_dma" parameter.
> > > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > > +; make sure that the value of "lcore_dma" falls within their
> > > > > > +range of the
> > > > > > values.
> > > > > > +
> > > > > > +; To use CPU for a test, please specify the "lcore" parameter.
> > > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > > +; make sure that the value of "lcore" falls within their range of
> > values.
> > > > > > +
> > > > > > +; To specify a configuration file, use the "--config" flag
> > > > > > +followed by the path
> > > > > > to the file.
> > > > > > +
> > > > > > +; To specify a result file, use the "--result" flag followed by
> > > > > > +the path to the
> > > > > > file.
> > > > > > +; If you do not specify a result file, one will be generated with
> > > > > > +the same name as the configuration ; file, with the addition of
> > > > > > +"_result.csv" at
> > > > > > the end.
> > > > > > +
> > > > > > +[case1]
> > > > > > +type=DMA_MEM_COPY
> > > > > > +mem_size=10
> > > > > > +buf_size=64,8192,2,MUL
> > > > > > +dma_ring_size=1024
> > > > > > +kick_batch=32
> > > > > > +src_numa_node=0
> > > > > > +dst_numa_node=0
> > > > > > +cache_flush=0
> > > > > > +test_seconds=2
> > > > > > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > > > >
> > > > > [Anoob] Isn't it better if we allow user to specify DMA dev ID
> > > > > rather than the PCI DBDF?
> > > > >
> > > > > In the long run, I would expect config file to provide {core,
> > > > > dma_dev_id, queue_id}
> > > > >
> > > > > > Another thought: why expose this at all? If we restrict this perf
> > > > > > application so that each thread uses only one vchan, then the
> > > > > > application can easily create this mapping at run time, unless you
> > > > > > want one thread to use 2 different vchans, which may not be desirable
> > > > > > since this is a standalone perf app.
> > > >
> > > > [Cheng] Thank you for the feedback.
> > > > Here are my thoughts:
> > > > > Firstly, the user may not know which device the DMA dev ID corresponds
> > > > > to, or which NUMA node it is on. In my example, I used the CBDMA
> > > > environment, so I did not specify the work queue ID. When using DSA,
> > > > the configuration would be something like lcore10@0000:00:04.2-q0
> > > > which contains core, dma and work queue id. The reason for exposing
> > > > these options is that we want the user to fully understand which cores
> > > > and devices are being used so that they know exactly where the
> > > > > performance data is coming from. For example, performance when cores
> > > > > and DMA devices are not on the same NUMA node, etc. This allows the
> > > > testing scenario to be precise and flexible. If the application
> > > > handles the mapping itself, the user loses control over the mapping
> > > > and may not get the performance data they want. We believe control
> > > > should be given to the user rather than the application.
> > >
> > > > [Anoob] I understand your viewpoints. Thanks for the explanation.
> > >
> >
> > [Cheng] sure, no problem.
> >
> > > >
> > > > >
> > > > > > +eal_args=--in-memory --file-prefix=test
> > > > > > +
> > > > > > +[case2]
> > > > > > +type=CPU_MEM_COPY
> > > > > > +mem_size=10
> > > > > > +buf_size=64,8192,2,MUL
> > > > > > +src_numa_node=0
> > > > > > +dst_numa_node=1
> > > > > > +cache_flush=0
> > > > > > +test_seconds=2
> > > > > > +lcore = 3, 4
> > > > > > +eal_args=--in-memory --no-pci
> > > > > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > > > > > new file mode 100644 index 0000000000..d65655b87b
> > > > > > --- /dev/null
> > > > > > +++ b/app/test-dma-perf/main.c
> > > > > > @@ -0,0 +1,569 @@
> > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > +
> > > > > > +#include <stdio.h>
> > > > > > +#include <stdlib.h>
> > > > > > +#include <getopt.h>
> > > > > > +#include <signal.h>
> > > > > > +#include <stdbool.h>
> > > > > > +#include <unistd.h>
> > > > > > +#include <sys/wait.h>
> > > > > > +#include <inttypes.h>
> > > > > > +#include <libgen.h>
> > > > > > +
> > > > > > +#include <rte_eal.h>
> > > > > > +#include <rte_cfgfile.h>
> > > > > > +#include <rte_string_fns.h>
> > > > > > +#include <rte_lcore.h>
> > > > > > +
> > > > > > +#include "main.h"
> > > > > > +
> > > > > > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > > > > > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > > > > > +
> > > > > > +#define MAX_EAL_PARAM_NB 100
> > > > > > +#define MAX_EAL_PARAM_LEN 1024
> > > > > > +
> > > > > > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > > > > > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > > > > > +
> > > > > > +#define CMDLINE_CONFIG_ARG "--config"
> > > > > > +#define CMDLINE_RESULT_ARG "--result"
> > > > > > +
> > > > > > +#define MAX_PARAMS_PER_ENTRY 4
> > > > > > +
> > > > > > +#define MAX_LONG_OPT_SZ 64
> > > > > > +
> > > > > > +enum {
> > > > > > +	TEST_TYPE_NONE = 0,
> > > > > > +	TEST_TYPE_DMA_MEM_COPY,
> > > > > > +	TEST_TYPE_CPU_MEM_COPY
> > > > > > +};
> > > > > > +
> > > > > > +#define MAX_TEST_CASES 16
> > > > > > +static struct test_configure test_cases[MAX_TEST_CASES];
> > > > > > +
> > > > > > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > > +
> > > > > > +static FILE *fd;
> > > > > > +
> > > > > > +static void
> > > > > > +output_csv(bool need_blankline)
> > > > > > +{
> > > > > > +	uint32_t i;
> > > > > > +
> > > > > > +	if (need_blankline) {
> > > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > > +	}
> > > > > > +
> > > > > > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > > > > > +		if (output_str[i][0]) {
> > > > > > +			fprintf(fd, "%s", output_str[i]);
> > > > > > +			output_str[i][0] = '\0';
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	fflush(fd);
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +output_env_info(void)
> > > > > > +{
> > > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > > > > > environment:\n");
> > > > > > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU
> > frequency,%"
> > > > > > +			PRIu64 "\n", rte_get_timer_hz());
> > > > > > +
> > > > > > +	output_csv(true);
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> > > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > > > > > +			CSV_HDR_FMT, case_id, case_cfg-
> > >test_type_str);
> > > > > > +
> > > > > > +	output_csv(true);
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +run_test_case(struct test_configure *case_cfg) {
> > > > > > +	switch (case_cfg->test_type) {
> > > > > > +	case TEST_TYPE_DMA_MEM_COPY:
> > > > > > +		mem_copy_benchmark(case_cfg, true);
> > > > > > +		break;
> > > > > > +	case TEST_TYPE_CPU_MEM_COPY:
> > > > > > +		mem_copy_benchmark(case_cfg, false);
> > > > > > +		break;
> > > > > > +	default:
> > > > > > +		printf("Unknown test type. %s\n", case_cfg-
> > >test_type_str);
> > > > > > +		break;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > > > > > +	uint32_t i;
> > > > > > +	uint32_t nb_lcores = rte_lcore_count();
> > > > > > +	struct test_configure_entry *mem_size = &case_cfg-
> > >mem_size;
> > > > > > +	struct test_configure_entry *buf_size = &case_cfg-
> > >buf_size;
> > > > > > +	struct test_configure_entry *ring_size = &case_cfg-
> > >ring_size;
> > > > > > +	struct test_configure_entry *kick_batch = &case_cfg-
> > >kick_batch;
> > > > > > +	struct test_configure_entry dummy = { 0 };
> > > > > > +	struct test_configure_entry *var_entry = &dummy;
> > > > > > +
> > > > > > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > > > > > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > > > > > +
> > > > > > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > > > > > +		printf("Case %u: Not enough lcores.\n", case_id);
> > > > > > +		return;
> > > > > > +	}
> > > > > > +
> > > > > > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > > > > > +
> > > > > > +	if (mem_size->incr != 0)
> > > > > > +		var_entry = mem_size;
> > > > > > +
> > > > > > +	if (buf_size->incr != 0)
> > > > > > +		var_entry = buf_size;
> > > > > > +
> > > > > > +	if (ring_size->incr != 0)
> > > > > > +		var_entry = ring_size;
> > > > > > +
> > > > > > +	if (kick_batch->incr != 0)
> > > > > > +		var_entry = kick_batch;
> > > > > > +
> > > > > > +	case_cfg->scenario_id = 0;
> > > > > > +
> > > > > > +	output_header(case_id, case_cfg);
> > > > > > +
> > > > > > +	for (var_entry->cur = var_entry->first; var_entry->cur <=
> > > > > > +var_entry-
> > > > > > >last;) {
> > > > > > +		case_cfg->scenario_id++;
> > > > > > +		printf("\nRunning scenario %d\n", case_cfg-
> > >scenario_id);
> > > > > > +
> > > > > > +		run_test_case(case_cfg);
> > > > > > +		output_csv(false);
> > > > > > +
> > > > > > +		if (var_entry->op == OP_ADD)
> > > > > > +			var_entry->cur += var_entry->incr;
> > > > > > +		else if (var_entry->op == OP_MUL)
> > > > > > +			var_entry->cur *= var_entry->incr;
> > > > > > +		else
> > > > > > +			break;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > > +static int
> > > > > > +parse_lcore(struct test_configure *test_case, const char *value) {
> > > > > > +	size_t len = strlen(value);
> > > > > > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > > > > > +	strcpy(input, value);
> > > > > > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > > > > > >lcore_dma_map);
> > > > > > +
> > > > > > +	if (test_case == NULL || value == NULL)
> > > > > > +		return -1;
> > > > > > +
> > > > > > +	memset(lcore_dma_map, 0, sizeof(struct
> > lcore_dma_map_t));
> > > > > > +
> > > > > > +	char *token = strtok(input, ", ");
> > > > > > +	while (token != NULL) {
> > > > > > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > > > > > +			free(input);
> > > > > > +			return -1;
> > > > > > +		}
> > > > > > +
> > > > > > +		uint16_t lcore_id = atoi(token);
> > > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] =
> > lcore_id;
> > > > > > +
> > > > > > +		token = strtok(NULL, ", ");
> > > > > > +	}
> > > > > > +
> > > > > > +	free(input);
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int
> > > > > > +parse_lcore_dma(struct test_configure *test_case, const char
> > *value)
> > > {
> > > > > > +	struct lcore_dma_map_t *lcore_dma_map;
> > > > > > +	char *input = strndup(value, strlen(value) + 1);
> > > > > > +	char *addrs = input;
> > > > > > +	char *ptrs[2];
> > > > > > +	char *start, *end, *substr;
> > > > > > +	uint16_t lcore_id;
> > > > > > +	int ret = 0;
> > > > > > +
> > > > > > +	while (*addrs == '\0')
> > > > > > +		addrs++;
> > > > > > +	if (*addrs == '\0') {
> > > > > > +		fprintf(stderr, "No input DMA addresses\n");
> > > > > > +		ret = -1;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +
> > > > > > +	substr = strtok(addrs, ",");
> > > > > > +	if (substr == NULL) {
> > > > > > +		fprintf(stderr, "No input DMA address\n");
> > > > > > +		ret = -1;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +
> > > > > > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > > > > > lcore_dma_map_t));
> > > > > > +
> > > > > > +	do {
> > > > > > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > > > > > +
> > > > > > +		start = strstr(ptrs[0], "lcore");
> > > > > > +		if (start == NULL) {
> > > > > > +			fprintf(stderr, "Illegal lcore\n");
> > > > > > +			ret = -1;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +
> > > > > > +		start += 5;
> > > > > > +		lcore_id = strtol(start, &end, 0);
> > > > > > +		if (end == start) {
> > > > > > +			fprintf(stderr, "No input lcore ID or ID %d is
> > > > > > wrong\n", lcore_id);
> > > > > > +			ret = -1;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +
> > > > > > +		lcore_dma_map = &test_case->lcore_dma_map;
> > > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt] =
> > lcore_id;
> > > > > > +		strcpy(lcore_dma_map-
> > >dma_names[lcore_dma_map-
> > > > > > >cnt], ptrs[1]);
> > > > > > +		lcore_dma_map->cnt++;
> > > > > > +		substr = strtok(NULL, ",");
> > > > > > +	} while (substr != NULL);
> > > > > > +
> > > > > > +out:
> > > > > > +	free(input);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static int
> > > > > > +parse_entry(const char *value, struct test_configure_entry *entry)
> > {
> > > > > > +	char input[255] = {0};
> > > > > > +	char *args[MAX_PARAMS_PER_ENTRY];
> > > > > > +	int args_nr = -1;
> > > > > > +
> > > > > > +	if (value == NULL || entry == NULL)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	strncpy(input, value, 254);
> > > > > > +	if (*input == '\0')
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	args_nr = rte_strsplit(input, strlen(input), args,
> > > > > > MAX_PARAMS_PER_ENTRY, ',');
> > > > > > +	if (args_nr != 1 && args_nr != 4)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > > > > > +
> > > > > > +	if (args_nr == 4) {
> > > > > > +		entry->last = (uint32_t)atoi(args[1]);
> > > > > > +		entry->incr = (uint32_t)atoi(args[2]);
> > > > > > +		if (!strcmp(args[3], "MUL"))
> > > > > > +			entry->op = OP_MUL;
> > > > > > +		else if (!strcmp(args[3], "ADD"))
> > > > > > +			entry->op = OP_ADD;
> > > > > > +		else {
> > > > > > +			printf("Invalid op %s.\n", args[3]);
> > > > > > +			args_nr = -1;
> > > > > > +		}
> > > > > > +	} else {
> > > > > > +		entry->op = OP_NONE;
> > > > > > +		entry->last = 0;
> > > > > > +		entry->incr = 0;
> > > > > > +	}
> > > > > > +out:
> > > > > > +	return args_nr;
> > > > > > +}
> > > > > > +
> > > > > > +static uint16_t
> > > > > > +load_configs(const char *path)
> > > > > > +{
> > > > > > +	struct rte_cfgfile *cfgfile;
> > > > > > +	int nb_sections, i;
> > > > > > +	struct test_configure *test_case;
> > > > > > +	char section_name[CFG_NAME_LEN];
> > > > > > +	const char *case_type;
> > > > > > +	const char *lcore_dma;
> > > > > > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > > > > > *kick_batch_str;
> > > > > > +	int args_nr, nb_vp;
> > > > > > +	bool is_dma;
> > > > > > +
> > > > > > +	printf("config file parsing...\n");
> > > > > > +	cfgfile = rte_cfgfile_load(path, 0);
> > > > > > +	if (!cfgfile) {
> > > > > > +		printf("Open configure file error.\n");
> > > > > > +		exit(1);
> > > > > > +	}
> > > > > > +
> > > > > > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > > > > > +	if (nb_sections > MAX_TEST_CASES) {
> > > > > > +		printf("Error: The maximum number of cases is
> > %d.\n",
> > > > > > MAX_TEST_CASES);
> > > > > > +		exit(1);
> > > > > > +	}
> > > > > > +
> > > > > > +	for (i = 0; i < nb_sections; i++) {
> > > > > > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i
> > + 1);
> > > > > > +		test_case = &test_cases[i];
> > > > > > +		case_type = rte_cfgfile_get_entry(cfgfile,
> > section_name,
> > > > > > "type");
> > > > > > +		if (!case_type) {
> > > > > > +			printf("Error: No case type in case %d, the
> > test will be
> > > > > > finished here.\n",
> > > > > > +				i + 1);
> > > > > > +			test_case->is_valid = false;
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > > > > > +			test_case->test_type =
> > > > > > TEST_TYPE_DMA_MEM_COPY;
> > > > > > +			test_case->test_type_str =
> > DMA_MEM_COPY;
> > > > > > +			is_dma = true;
> > > > > > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > > > > > +			test_case->test_type =
> > > > > > TEST_TYPE_CPU_MEM_COPY;
> > > > > > +			test_case->test_type_str =
> > CPU_MEM_COPY;
> > > > > > +			is_dma = false;
> > > > > > +		} else {
> > > > > > +			printf("Error: Cannot find case type %s in
> > case%d.\n",
> > > > > > case_type, i + 1);
> > > > > > +			test_case->is_valid = false;
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		nb_vp = 0;
> > > > > > +
> > > > > > +		test_case->src_numa_node =
> > > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > +
> > > > > > 	section_name, "src_numa_node"));
> > > > > > +		test_case->dst_numa_node =
> > > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > +
> > > > > > 	section_name, "dst_numa_node"));
> > > > > > +
> > > > > > +		mem_size_str = rte_cfgfile_get_entry(cfgfile,
> > section_name,
> > > > > > "mem_size");
> > > > > > +		args_nr = parse_entry(mem_size_str, &test_case-
> > > > > > >mem_size);
> > > > > > +		if (args_nr < 0) {
> > > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > > +			test_case->is_valid = false;
> > > > > > +			continue;
> > > > > > +		} else if (args_nr > 1)
> > > > > > +			nb_vp++;
> > > > > > +
> > > > > > +		buf_size_str = rte_cfgfile_get_entry(cfgfile,
> > section_name,
> > > > > > "buf_size");
> > > > > > +		args_nr = parse_entry(buf_size_str, &test_case-
> > >buf_size);
> > > > > > +		if (args_nr < 0) {
> > > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > > +			test_case->is_valid = false;
> > > > > > +			continue;
> > > > > > +		} else if (args_nr > 1)
> > > > > > +			nb_vp++;
> > > > > > +
> > > > > > +		if (is_dma) {
> > > > > > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > > > section_name,
> > > > > > +
> > > > > > 	"dma_ring_size");
> > > > > > +			args_nr = parse_entry(ring_size_str,
> > &test_case-
> > > > > > >ring_size);
> > > > > > +			if (args_nr < 0) {
> > > > > > +				printf("parse error in case %d.\n", i +
> > 1);
> > > > > > +				test_case->is_valid = false;
> > > > > > +				continue;
> > > > > > +			} else if (args_nr > 1)
> > > > > > +				nb_vp++;
> > > > > > +
> > > > > > +			kick_batch_str =
> > rte_cfgfile_get_entry(cfgfile,
> > > > > > section_name, "kick_batch");
> > > > > > +			args_nr = parse_entry(kick_batch_str,
> > &test_case-
> > > > > > >kick_batch);
> > > > > > +			if (args_nr < 0) {
> > > > > > +				printf("parse error in case %d.\n", i +
> > 1);
> > > > > > +				test_case->is_valid = false;
> > > > > > +				continue;
> > > > > > +			} else if (args_nr > 1)
> > > > > > +				nb_vp++;
> > > > > > +
> > > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > > section_name, "lcore_dma");
> > > > > > +			int lcore_ret = parse_lcore_dma(test_case,
> > > > > > lcore_dma);
> > > > > > +			if (lcore_ret < 0) {
> > > > > > +				printf("parse lcore dma error in case
> > %d.\n", i
> > > > > 1);
> > > > > > +				test_case->is_valid = false;
> > > > > > +				continue;
> > > > > > +			}
> > > > > > +		} else {
> > > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > > section_name, "lcore");
> > > > > > +			int lcore_ret = parse_lcore(test_case,
> > lcore_dma);
> > > > > > +			if (lcore_ret < 0) {
> > > > > > +				printf("parse lcore error in case
> > %d.\n", i + 1);
> > > > > > +				test_case->is_valid = false;
> > > > > > +				continue;
> > > > > > +			}
> > > > > > +		}
> > > > > > +
> > > > > > +		if (nb_vp > 1) {
> > > > > > +			printf("Error, each section can only have a
> > single
> > > > > > variable parameter.\n");
> > > > > > +			test_case->is_valid = false;
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		test_case->cache_flush =
> > > > > > +			(int)atoi(rte_cfgfile_get_entry(cfgfile,
> > section_name,
> > > > > > "cache_flush"));
> > > > > > +		test_case->test_secs =
> > > > > > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > +					section_name,
> > "test_seconds"));
> > > > > > +
> > > > > > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > > > > > section_name, "eal_args");
> > > > > > +		test_case->is_valid = true;
> > > > > > +	}
> > > > > > +
> > > > > > +	rte_cfgfile_close(cfgfile);
> > > > > > +	printf("config file parsing complete.\n\n");
> > > > > > +	return i;
> > > > > > +}
> > > > > > +
> > > > > > +/* Parse the argument given in the command line of the
> > > > > > +application */ static int append_eal_args(int argc, char **argv,
> > > > > > +const char *eal_args, char **new_argv) {
> > > > > > +	int i;
> > > > > > +	char *tokens[MAX_EAL_PARAM_NB];
> > > > > > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > > > > > +	int token_nb, new_argc = 0;
> > > > > > +
> > > > > > +	for (i = 0; i < argc; i++) {
> > > > > > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > > > > > +				(strcmp(argv[i],
> > CMDLINE_RESULT_ARG) ==
> > > > > > 0)) {
> > > > > > +			i++;
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +		strlcpy(new_argv[new_argc], argv[i],
> > > > > > sizeof(new_argv[new_argc]));
> > > > > > +		new_argc++;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (eal_args) {
> > > > > > +		strlcpy(args, eal_args, sizeof(args));
> > > > > > +		token_nb = rte_strsplit(args, strlen(args),
> > > > > > +					tokens,
> > MAX_EAL_PARAM_NB, ' ');
> > > > > > +		for (i = 0; i < token_nb; i++)
> > > > > > +			strcpy(new_argv[new_argc++], tokens[i]);
> > > > > > +	}
> > > > > > +
> > > > > > +	return new_argc;
> > > > > > +}
> > > > > > +
> > > > > > +int
> > > > > > +main(int argc, char *argv[])
> > > > > > +{
> > > > > > +	int ret;
> > > > > > +	uint16_t case_nb;
> > > > > > +	uint32_t i, nb_lcores;
> > > > > > +	pid_t cpid, wpid;
> > > > > > +	int wstatus;
> > > > > > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > > > > > +	char *pargs[MAX_EAL_PARAM_NB];
> > > > > > +	char *cfg_path_ptr = NULL;
> > > > > > +	char *rst_path_ptr = NULL;
> > > > > > +	char rst_path[PATH_MAX];
> > > > > > +	int new_argc;
> > > > > > +	bool is_first_case = true;
> > > > > > +
> > > > > > +	memset(args, 0, sizeof(args));
> > > > > > +
> > > > > > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > > > > > +		pargs[i] = args[i];
> > > > > > +
> > > > > > +	for (i = 0; i < (uint32_t)argc; i++) {
> > > > > > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > > +			cfg_path_ptr = argv[i + 1];
> > > > > > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > > +			rst_path_ptr = argv[i + 1];
> > > > > > +	}
> > > > > > +	if (cfg_path_ptr == NULL) {
> > > > > > +		printf("Config file not assigned.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +	if (rst_path_ptr == NULL) {
> > > > > > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > > > > > +		strcat(strtok(basename(rst_path), "."),
> > "_result.csv");
> > > > > > +		rst_path_ptr = rst_path;
> > > > > > +	}
> > > > > > +
> > > > > > +	case_nb = load_configs(cfg_path_ptr);
> > > > > > +	fd = fopen(rst_path_ptr, "w");
> > > > > > +	if (fd == NULL) {
> > > > > > +		printf("Open output CSV file error.\n");
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +	fclose(fd);
> > > > > > +
> > > > > > +	for (i = 0; i < case_nb; i++) {
> > > > > > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > > > > > +			printf("No test type in test case %d.\n\n", i +
> > 1);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +		if (!test_cases[i].is_valid) {
> > > > > > +			printf("Invalid test case %d.\n\n", i + 1);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		cpid = fork();
> > > > >
> > > > > [Anoob] Do we really need fork()? Can't we use code like,
> > > > >
> > > > > 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> > > > > 			ret |= rte_eal_wait_lcore(lcore_id);
> > > > > 		}
> > > > >
> > > > > to wait for all threads to exit?
> > > >
> > > > [Cheng] Good question. Fork() is used here to establish a new process
> > > > for the new test case. In order for each test case to have a new EAL
> > > > environment (for flexibility), the EAL must be reinitialized for
> > > > each case.
> > > > However, the EAL parameters can only be initialized once per process.
> > > > Therefore, we use a new process to run each new test case. Moreover,
> > > > each test case runs sequentially and does not affect the others,
> > > > ensuring the accuracy of the performance data. Your code would wait
> > > > for all threads to exit in the same process. However, it would not
> > > > provide a "clean" environment for each test case like fork() does.
> > > > Fork() allows us to have a fully reinitialized environment, with no
> > > > impact or side effects from previous test cases. This results in
> > > > clean, precise performance data for each case.
> > > >
> > > > Please let me know your thoughts on this. And please let me know if
> > > > you have any other questions or require any clarification.
> > >
> > > [Anoob] This was just a generic observation. I do not have a strong
> > > opinion either way.
> > >
> >
> > [Cheng] sure, got it.
> >
> > > >
> > > > Thanks,
> > > > Cheng
> > > >
> > > > >
> > > > > > +		if (cpid < 0) {
> > > > > > +			printf("Fork case %d failed.\n", i + 1);
> > > > > > +			exit(EXIT_FAILURE);
> > > > > > +		} else if (cpid == 0) {
> > > > > > +			printf("\nRunning case %u\n\n", i + 1);
> > > > > > +
> > > > > > +			new_argc = append_eal_args(argc, argv,
> > > > > > test_cases[i].eal_args, pargs);
> > > > > > +			ret = rte_eal_init(new_argc, pargs);
> > > > > > +			if (ret < 0)
> > > > > > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > > > > > arguments\n");
> > > > > > +
> > > > > > +			/* Check lcores. */
> > > > > > +			nb_lcores = rte_lcore_count();
> > > > > > +			if (nb_lcores < 2)
> > > > > > +				rte_exit(EXIT_FAILURE,
> > > > > > +					"There should be at least 2
> > worker
> > > > > > lcores.\n");
> > > > > > +
> > > > > > +			fd = fopen(rst_path_ptr, "a");
> > > > > > +			if (!fd) {
> > > > > > +				printf("Open output CSV file
> > error.\n");
> > > > > > +				return 0;
> > > > > > +			}
> > > > > > +
> > > > > > +			if (is_first_case) {
> > > > > > +				output_env_info();
> > > > > > +				is_first_case = false;
> > > > > > +			}
> > > > > > +			run_test(i + 1, &test_cases[i]);
> > > > > > +
> > > > > > +			/* clean up the EAL */
> > > > > > +			rte_eal_cleanup();
> > > > > > +
> > > > > > +			fclose(fd);
> > > > > > +
> > > > > > +			printf("\nCase %u completed.\n\n", i + 1);
> > > > > > +
> > > > > > +			exit(EXIT_SUCCESS);
> > > > > > +		} else {
> > > > > > +			wpid = waitpid(cpid, &wstatus, 0);
> > > > > > +			if (wpid == -1) {
> > > > > > +				printf("waitpid error.\n");
> > > > > > +				exit(EXIT_FAILURE);
> > > > > > +			}
> > > > > > +
> > > > > > +			if (WIFEXITED(wstatus))
> > > > > > +				printf("Case process exited. status
> > %d\n\n",
> > > > > > +					WEXITSTATUS(wstatus));
> > > > > > +			else if (WIFSIGNALED(wstatus))
> > > > > > +				printf("Case process killed by signal
> > %d\n\n",
> > > > > > +					WTERMSIG(wstatus));
> > > > > > +			else if (WIFSTOPPED(wstatus))
> > > > > > +				printf("Case process stopped by
> > signal
> > > > > > %d\n\n",
> > > > > > +					WSTOPSIG(wstatus));
> > > > > > +			else if (WIFCONTINUED(wstatus))
> > > > > > +				printf("Case process
> > continued.\n\n");
> > > > > > +			else
> > > > > > +				printf("Case process unknown
> > > > > > terminated.\n\n");
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	printf("Bye...\n");
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> > > > > > new file mode 100644 index 0000000000..215ac42673
> > > > > > --- /dev/null
> > > > > > +++ b/app/test-dma-perf/main.h
> > > > > > @@ -0,0 +1,69 @@
> > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > +
> > > > > > +#ifndef _MAIN_H_
> > > > > > +#define _MAIN_H_
> > > > > > +
> > > > > > +
> > > > > > +#include <rte_common.h>
> > > > > > +#include <rte_cycles.h>
> > > > > > +#include <rte_dev.h>
> > > > > > +#include <rte_dmadev.h>
> > > > > > +
> > > > > > +#ifndef __maybe_unused
> > > > > > +#define __maybe_unused	__rte_unused
> > > > > > +#endif
> > > > > > +
> > > > > > +#define MAX_WORKER_NB 128
> > > > > > +#define MAX_OUTPUT_STR_LEN 512
> > > > > > +
> > > > > > +#define MAX_DMA_NB 128
> > > > > > +#define MAX_LCORE_NB 256
> > > > > > +
> > > > > > +extern char
> > output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > > +
> > > > > > +typedef enum {
> > > > > > +	OP_NONE = 0,
> > > > > > +	OP_ADD,
> > > > > > +	OP_MUL
> > > > > > +} alg_op_type;
> > > > > > +
> > > > > > +struct test_configure_entry {
> > > > > > +	uint32_t first;
> > > > > > +	uint32_t last;
> > > > > > +	uint32_t incr;
> > > > > > +	alg_op_type op;
> > > > > > +	uint32_t cur;
> > > > > > +};
> > > > > > +
> > > > > > +struct lcore_dma_map_t {
> > > > > > +	uint32_t lcores[MAX_WORKER_NB];
> > > > > > +	char
> > dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > > > > > +	int16_t dma_ids[MAX_WORKER_NB];
> > > > > > +	uint16_t cnt;
> > > > > > +};
> > > > > > +
> > > > > > +struct test_configure {
> > > > > > +	bool is_valid;
> > > > > > +	uint8_t test_type;
> > > > > > +	const char *test_type_str;
> > > > > > +	uint16_t src_numa_node;
> > > > > > +	uint16_t dst_numa_node;
> > > > > > +	uint16_t opcode;
> > > > > > +	bool is_dma;
> > > > > > +	struct lcore_dma_map_t lcore_dma_map;
> > > > > > +	struct test_configure_entry mem_size;
> > > > > > +	struct test_configure_entry buf_size;
> > > > > > +	struct test_configure_entry ring_size;
> > > > > > +	struct test_configure_entry kick_batch;
> > > > > > +	uint32_t cache_flush;
> > > > > > +	uint32_t nr_buf;
> > > > > > +	uint16_t test_secs;
> > > > > > +	const char *eal_args;
> > > > > > +	uint8_t scenario_id;
> > > > > > +};
> > > > > > +
> > > > > > +void mem_copy_benchmark(struct test_configure *cfg, bool
> > is_dma);
> > > > > > +
> > > > > > +#endif /* _MAIN_H_ */
> > > > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > > > perf/meson.build new file mode 100644 index
> > 0000000000..bd6c264002
> > > > > > --- /dev/null
> > > > > > +++ b/app/test-dma-perf/meson.build
> > > > > > @@ -0,0 +1,17 @@
> > > > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > > > > +Intel Corporation
> > > > > > +
> > > > > > +# meson file, for building this app as part of a main DPDK build.
> > > > > > +
> > > > > > +if is_windows
> > > > > > +    build = false
> > > > > > +    reason = 'not supported on Windows'
> > > > > > +    subdir_done()
> > > > > > +endif
> > > > > > +
> > > > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > > > +
> > > > > > +sources = files(
> > > > > > +        'main.c',
> > > > > > +        'benchmark.c',
> > > > > > +)
> > > > > > --
> > > > > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16  2:56             ` Jiang, Cheng1
@ 2023-06-16  6:32               ` Anoob Joseph
  2023-06-16  8:43                 ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-16  6:32 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Cheng,

Please see inline.

Thanks,
Anoob

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Friday, June 16, 2023 8:26 AM
> To: Anoob Joseph <anoobj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Anoob,
> 
> Replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Thursday, June 15, 2023 11:48 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> application
> >
> > Hi Cheng,
> >
> > Please see inline.
> >
> > Thanks,
> > Anoob
> >
> > > -----Original Message-----
> > > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Sent: Thursday, June 15, 2023 7:36 PM
> > > To: Anoob Joseph <anoobj@marvell.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> > >
> > > Hi Anoob,
> > >
> > > Replies are inline.
> > >
> > > Thanks,
> > > Cheng
> > >
> > > > -----Original Message-----
> > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > Sent: Thursday, June 15, 2023 4:45 PM
> > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> > Chenbo
> > > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > > <amitprakashs@marvell.com>; Ma, WenwuX
> <wenwux.ma@intel.com>;
> > > > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > > <xingguang.he@intel.com>
> > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > > >
> > > > Hi Cheng,
> > > >
> > > > Please see inline.
> > > >
> > > > Thanks,
> > > > Anoob
> > > >
> > > > > -----Original Message-----
> > > > > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > > Sent: Thursday, June 15, 2023 1:31 PM
> > > > > To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> > > Amit
> > > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang,
> > > > YuanX
> > > > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > application
> > > > >
> > > > > Hi,
> > > > >
> > > > > Thanks for your comments, the replies are inline.
> > > > >
> > > > > Thanks,
> > > > > Cheng
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > > > Sent: Thursday, June 15, 2023 1:22 PM
> > > > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>;
> thomas@monjalon.net;
> > > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > > mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>;
> > > > Amit
> > > > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > Wang,
> > > > > YuanX
> > > > > > <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> > > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > > application
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > > Thanks for working on the comments. A few more top-level comments are
> > > > > > > inline.
> > > > > >
> > > > > > Thanks,
> > > > > > Anoob
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > > Sent: Tuesday, June 13, 2023 10:02 AM
> > > > > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > > > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit
> Prakash
> > > > > Shukla
> > > > > > > <amitprakashs@marvell.com>; Anoob Joseph
> > > <anoobj@marvell.com>
> > > > > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > > > > xingguang.he@intel.com;
> > > > > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > > > application
> > > > > > >
> > > > > > > External Email
> > > > > > >
> > > > > > > ------------------------------------------------------------------
> > > > > > > --
> > > > > > > -- There are many high-performance DMA devices supported in
> > DPDK
> > > > > > > now, and these DMA devices can also be integrated into other
> > > > > > > modules of DPDK as accelerators, such as Vhost. Before
> integrating
> > > > > > > DMA into applications, developers need to know the performance
> > of
> > > > > > > these DMA devices in various scenarios and the performance of
> > CPUs
> > > > > > > in the same scenario, such as different buffer lengths. Only in
> > > > > > > this way can we know the target performance of the application
> > > > > > > accelerated by using them. This patch introduces a
> > > > > > > high-performance testing tool, which supports comparing the
> > > > > > > performance of CPU and DMA in different scenarios automatically
> > > > > > > with a pre- set config file. Memory Copy performance test are
> > > > > > supported for now.
> > > > > > >
> > > > > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > > > > ---
> > > > > > > v6:
> > > > > > >   improved code based on Anoob's comments;
> > > > > > >   fixed some code structure issues;
> > > > > > > v5:
> > > > > > >   fixed some LONG_LINE warnings;
> > > > > > > v4:
> > > > > > >   fixed inaccuracy of the memory footprint display;
> > > > > > > v3:
> > > > > > >   fixed some typos;
> > > > > > > v2:
> > > > > > >   added lcore/dmadev designation;
> > > > > > >   added error case process;
> > > > > > >   removed worker_threads parameter from config.ini;
> > > > > > >   improved the logs;
> > > > > > >   improved config file;
> > > > > > >
> > > > > > >  app/meson.build               |   1 +
> > > > > > >  app/test-dma-perf/benchmark.c | 477
> > > > > ++++++++++++++++++++++++++++
> > > > > > > app/test-dma-perf/config.ini  |  59 ++++
> > > > > > >  app/test-dma-perf/main.c      | 569
> > > > > > > ++++++++++++++++++++++++++++++++++
> > > > > > >  app/test-dma-perf/main.h      |  69 +++++
> > > > > > >  app/test-dma-perf/meson.build |  17 +
> > > > > > >  6 files changed, 1192 insertions(+)  create mode 100644
> > > > > > > app/test-dma-perf/benchmark.c  create mode 100644
> > > > > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h
> create
> > > > > > > mode
> > > > > > > 100644 app/test-dma-perf/meson.build
> > > > > > >
> > > > > > > diff --git a/app/meson.build b/app/meson.build index
> > > > > > > 74d2420f67..4fc1a83eba 100644
> > > > > > > --- a/app/meson.build
> > > > > > > +++ b/app/meson.build
> > > > > > > @@ -19,6 +19,7 @@ apps = [
> > > > > > >          'test-cmdline',
> > > > > > >          'test-compress-perf',
> > > > > > >          'test-crypto-perf',
> > > > > > > +        'test-dma-perf',
> > > > > > >          'test-eventdev',
> > > > > > >          'test-fib',
> > > > > > >          'test-flow-perf',
> > > > > > > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > > > > > > perf/benchmark.c new file mode 100644 index
> > > 0000000000..bc1ca82297
> > > > > > > --- /dev/null
> > > > > > > +++ b/app/test-dma-perf/benchmark.c
> > > > > > > @@ -0,0 +1,477 @@
> > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > > +
> > > > > > > +#include <inttypes.h>
> > > > > > > +#include <stdio.h>
> > > > > > > +#include <stdlib.h>
> > > > > > > +#include <unistd.h>
> > > > > > > +
> > > > > > > +#include <rte_time.h>
> > > > > > > +#include <rte_mbuf.h>
> > > > > > > +#include <rte_dmadev.h>
> > > > > > > +#include <rte_malloc.h>
> > > > > > > +#include <rte_lcore.h>
> > > > > > > +
> > > > > > > +#include "main.h"
> > > > > > > +
> > > > > > > +#define MAX_DMA_CPL_NB 255
> > > > > > > +
> > > > > > > +#define TEST_WAIT_U_SECOND 10000
> > > > > > > +
> > > > > > > +#define CSV_LINE_DMA_FMT "Scenario
> %u,%u,%s,%u,%u,%.2lf,%"
> > > > > PRIu64
> > > > > > > ",%.3lf,%.3lf\n"
> > > > > > > +#define CSV_LINE_CPU_FMT "Scenario
> %u,%u,NA,%u,%u,%.2lf,%"
> > > > > PRIu64
> > > > > > > ",%.3lf,%.3lf\n"
> > > > > > > +
> > > > > > > +struct worker_info {
> > > > > > > +	bool ready_flag;
> > > > > > > +	bool start_flag;
> > > > > > > +	bool stop_flag;
> > > > > > > +	uint32_t total_cpl;
> > > > > > > +	uint32_t test_cpl;
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct lcore_params {
> > > > > > > +	uint8_t scenario_id;
> > > > > > > +	unsigned int lcore_id;
> > > > > > > +	char *dma_name;
> > > > > > > +	uint16_t worker_id;
> > > > > > > +	uint16_t dev_id;
> > > > > > > +	uint32_t nr_buf;
> > > > > > > +	uint16_t kick_batch;
> > > > > > > +	uint32_t buf_size;
> > > > > > > +	uint16_t test_secs;
> > > > > > > +	struct rte_mbuf **srcs;
> > > > > > > +	struct rte_mbuf **dsts;
> > > > > > > +	struct worker_info worker_info;
> > > > > > > +};
> > > > > > > +
> > > > > > > +static struct rte_mempool *src_pool; static struct rte_mempool
> > > > > > > +*dst_pool;
> > > > > > > +
> > > > > > > +static volatile struct lcore_params
> > > > > *worker_params[MAX_WORKER_NB];
> > > > > > > +
> > > > > > > +#define PRINT_ERR(...) print_err(__func__, __LINE__,
> > > __VA_ARGS__)
> > > > > > > +
> > > > > > > +static inline int
> > > > > > > +__rte_format_printf(3, 4)
> > > > > > > +print_err(const char *func, int lineno, const char *format, ...) {
> > > > > > > +	va_list ap;
> > > > > > > +	int ret;
> > > > > > > +
> > > > > > > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > > > > > > +	va_start(ap, format);
> > > > > > > +	ret += vfprintf(stderr, format, ap);
> > > > > > > +	va_end(ap);
> > > > > > > +
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static inline void
> > > > > > > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t
> > > > > > > +nb_workers,
> > > > > > > uint16_t test_secs,
> > > > > > > +				uint32_t total_cnt, float *memory,
> > > uint32_t
> > > > > > > *ave_cycle,
> > > > > > > +				float *bandwidth, float *mops) {
> > > > > > > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) /
> > > (1024
> > > > > > > +*
> > > > > > > 1024);
> > > > > > > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > > > > > > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > > > > > > (float)*ave_cycle)) / 1000000000;
> >
> > [Anoob] The above calculation may not yield accurate results. 'ave_cycle'
> > is truncated to an integer, so bandwidth can only take a small set of
> > discrete values. Instead, we can do the calculation directly, like:
> >
> > 	*bandwidth = ((float)buf_size * 8 * total_cnt / test_secs) /
> > 1000000000;
> > 	*mops = (float)total_cnt / test_secs / 1000000;
> >
> > Same issue is there with below calculation as well. Please check.
> 
> [Cheng] Yes, I've noticed as well. Dengdui also mentioned this in his
> comments. I will address this issue in v7. Thank you very much.
> 
> >
> > Side note: in bandwidth calculation, shouldn't we be dividing by
> > 1024*1024*1024? I've just carried the calculation that you used. Feel free to
> > correct as required.
> 
> [Cheng] The unit I'm using in my calculations is Gb/s (Gigabits per second),
> which is based on the decimal system. Therefore, I use the factor of 1000^3
> (or 1,000,000,000).
> The method you mentioned, dividing by 1024^3, is typically used when
> calculating Gib/s (gibibits per second), a binary-based unit.
> I think both methods are acceptable as long as the units and calculation
> methods correspond.
> What do you think?

[Anoob] You are right. Existing logic is correct. Thanks for the explanation.

> 
> >
> > > > > > > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > > > > > > +
> > > > > > > +static void
> > > > > > > +output_result(uint8_t scenario_id, uint32_t lcore_id, char
> > > > > > > +*dma_name,
> > > > > > > uint64_t ave_cycle,
> > > > > > > +			uint32_t buf_size, uint32_t nr_buf, float
> > > memory,
> > > > > > > +			float bandwidth, float mops, bool is_dma) {
> > > > > > > +	if (is_dma)
> > > > > > > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > > > > > > +	else
> > > > > > > +		printf("lcore %u\n", lcore_id);
> > > > > > > +
> > > > > > > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u,
> > > nr_buf:
> > > > > > > +%u,
> > > > > > > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > > > > > > +			ave_cycle, buf_size, nr_buf, memory,
> > > > > > > rte_get_timer_hz());
> > > > > > > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n",
> > > bandwidth,
> > > > > > > +mops);
> > > > > > > +
> > > > > > > +	if (is_dma)
> > > > > > > +		snprintf(output_str[lcore_id],
> > > MAX_OUTPUT_STR_LEN,
> > > > > > > CSV_LINE_DMA_FMT,
> > > > > > > +			scenario_id, lcore_id, dma_name, buf_size,
> > > > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> > > mops);
> > > > > > > +	else
> > > > > > > +		snprintf(output_str[lcore_id],
> > > MAX_OUTPUT_STR_LEN,
> > > > > > > CSV_LINE_CPU_FMT,
> > > > > > > +			scenario_id, lcore_id, buf_size,
> > > > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> > > mops); }
> > > > > > > +
> > > > > > > +static inline void
> > > > > > > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > > > > > > +		__maybe_unused uint32_t buf_size,
> > > > > > > +		__maybe_unused uint32_t nr_buf) { #ifdef
> > > > RTE_ARCH_X86_64
> > > > > > > +	char *data;
> > > > > > > +	struct rte_mbuf **srcs = array;
> > > > > > > +	uint32_t i, offset;
> > > > > > > +
> > > > > > > +	for (i = 0; i < nr_buf; i++) {
> > > > > > > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > > > > > > +		for (offset = 0; offset < buf_size; offset += 64)
> > > > > > > +			__builtin_ia32_clflush(data + offset);
> > > > > > > +	}
> > > > > > > +#endif
> > > > > > > +}
> > > > > > > +
> > > > > > > +/* Configuration of device. */
> > > > > > > +static void
> > > > > > > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > > > > > > +	uint16_t vchan = 0;
> > > > > > > +	struct rte_dma_info info;
> > > > > > > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > > > > > +	struct rte_dma_vchan_conf qconf = {
> > > > > > > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > > > > > +		.nb_desc = ring_size
> > > > > > > +	};
> > > > > > > +
> > > > > > > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma
> > > configure.\n");
> > > > > > > +
> > > > > > > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > > > > > > +		rte_exit(EXIT_FAILURE, "Error with queue
> > > configuration.\n");
> > > > > > > +
> > > > > > > +	rte_dma_info_get(dev_id, &info);
> > > > > > > +	if (info.nb_vchans != 1)
> > > > > > > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > > > > > > reported on device id. %u\n",
> > > > > > > +				dev_id);
> > > > > > > +
> > > > > > > +	if (rte_dma_start(dev_id) != 0)
> > > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > > > > > > +
> > > > > > > +static int
> > > > > > > +config_dmadevs(struct test_configure *cfg) {
> > > > > > > +	uint32_t ring_size = cfg->ring_size.cur;
> > > > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > > > +	uint32_t nb_workers = ldm->cnt;
> > > > > > > +	uint32_t i;
> > > > > > > +	int dev_id;
> > > > > > > +	uint16_t nb_dmadevs = 0;
> > > > > > > +	char *dma_name;
> > > > > > > +
> > > > > > > +	for (i = 0; i < ldm->cnt; i++) {
> > > > > > > +		dma_name = ldm->dma_names[i];
> > > > > > > +		dev_id =
> > > rte_dma_get_dev_id_by_name(dma_name);
> > > > > > > +		if (dev_id == -1) {
> > > > > > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > > > > > dma_name);
> > > > > > > +			goto end;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		ldm->dma_ids[i] = dev_id;
> > > > > > > +		configure_dmadev_queue(dev_id, ring_size);
> > > > > > > +		++nb_dmadevs;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +end:
> > > > > > > +	if (nb_dmadevs < nb_workers) {
> > > > > > > +		printf("Not enough dmadevs (%u) for all workers
> > > (%u).\n",
> > > > > > > nb_dmadevs, nb_workers);
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > > > > > +
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +#define POLL_MAX 1000
> > > > > > > +
> > > > > > > +
> > > > > >
> > > > > > [Anoob] Extra blank line. You can consider removing.
> > > > >
> > > > > [Cheng] sure, sorry for the miss.
> > > > >
> > > > > >
> > > > > > > +static inline void
> > > > > > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t
> *async_cnt,
> > > > > > > +			volatile struct worker_info *worker_info) {
> > > > > > > +	int ret;
> > > > > > > +	uint16_t nr_cpl;
> > > > > > > +
> > > > > > > +	ret = rte_dma_submit(dev_id, 0);
> > > > > > > +	if (ret < 0) {
> > > > > > > +		rte_dma_stop(dev_id);
> > > > > > > +		rte_dma_close(dev_id);
> > > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL,
> > > > > > > NULL);
> > > > > > > +	*async_cnt -= nr_cpl;
> > > > > > > +	worker_info->total_cpl += nr_cpl; }
> > > > > > > +
> > > > > > > +static inline int
> > > > > > > +do_dma_mem_copy(void *p)
> > > > > > > +{
> > > > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > > > +	volatile struct lcore_params *para =
> > > worker_params[*para_idx];
> > > > > > > +	volatile struct worker_info *worker_info = &(para-
> > > >worker_info);
> > > > > > > +	const uint16_t dev_id = para->dev_id;
> > > > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > > > +	const uint16_t kick_batch = para->kick_batch;
> > > > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > > > +	uint16_t nr_cpl;
> > > > > > > +	uint64_t async_cnt = 0;
> > > > > > > +	uint32_t i;
> > > > > > > +	uint32_t poll_cnt = 0;
> > > > > > > +	int ret;
> > > > > > > +
> > > > > > > +	worker_info->stop_flag = false;
> > > > > > > +	worker_info->ready_flag = true;
> > > > > > > +
> > > > > > > +	while (!worker_info->start_flag)
> > > > > > > +		;
> > > > > > > +
> > > > > > > +	while (1) {
> > > > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > > > +dma_copy:
> > > > > > > +			ret = rte_dma_copy(dev_id, 0,
> > > > > > > rte_pktmbuf_iova(srcs[i]),
> > > > > > > +				rte_pktmbuf_iova(dsts[i]), buf_size,
> > > 0);
> > > > > > > +			if (unlikely(ret < 0)) {
> > > > > > > +				if (ret == -ENOSPC) {
> > > > > > > +
> > > 	do_dma_submit_and_poll(dev_id,
> > > > > > > &async_cnt, worker_info);
> > > > > > > +					goto dma_copy;
> > > > > > > +				} else {
> > > > > > > +					/* Error exit */
> > > > > > > +					rte_dma_stop(dev_id);
> > > > > > > +					rte_exit(EXIT_FAILURE,
> > > "DMA
> > > > > > > enqueue failed\n");
> > > > > > > +				}
> > > > > > > +			}
> > > > > > > +			async_cnt++;
> > > > > > > +
> > > > > > > +			if ((async_cnt % kick_batch) == 0)
> > > > > > > +				do_dma_submit_and_poll(dev_id,
> > > > > > > &async_cnt, worker_info);
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		if (worker_info->stop_flag)
> > > > > > > +			break;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	rte_dma_submit(dev_id, 0);
> > > > > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > > > > +		async_cnt -= nr_cpl;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static inline int
> > > > > > > +do_cpu_mem_copy(void *p)
> > > > > > > +{
> > > > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > > > +	volatile struct lcore_params *para =
> > > worker_params[*para_idx];
> > > > > > > +	volatile struct worker_info *worker_info = &(para-
> > > >worker_info);
> > > > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > > > +	uint32_t i;
> > > > > > > +
> > > > > > > +	worker_info->stop_flag = false;
> > > > > > > +	worker_info->ready_flag = true;
> > > > > > > +
> > > > > > > +	while (!worker_info->start_flag)
> > > > > > > +		;
> > > > > > > +
> > > > > > > +	while (1) {
> > > > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > > > +			/* copy buffer form src to dst */
> > > > > > > +			rte_memcpy((void
> > > > > > > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > > > > > > +				(void
> > > > > > > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > > > > > > +				(size_t)buf_size);
> > > > > > > +			worker_info->total_cpl++;
> > > > > > > +		}
> > > > > > > +		if (worker_info->stop_flag)
> > > > > > > +			break;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf
> > > > > ***srcs,
> > > > > > > +			struct rte_mbuf ***dsts)
> > > > > > > +{
> > > > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > > > +	unsigned int nr_sockets;
> > > > > > > +	uint32_t nr_buf = cfg->nr_buf;
> > > > > > > +
> > > > > > > +	nr_sockets = rte_socket_count();
> > > > > > > +	if (cfg->src_numa_node >= nr_sockets ||
> > > > > > > +		cfg->dst_numa_node >= nr_sockets) {
> > > > > > > +		printf("Error: Source or destination numa exceeds
> > > the acture
> > > > > > > numa nodes.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	src_pool =
> > > rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > > > > > > +			nr_buf, /* n == num elements */
> > > > > > > +			64,  /* cache size */
> > > > > > > +			0,   /* priv size */
> > > > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > > > +			cfg->src_numa_node);
> > > > > > > +	if (src_pool == NULL) {
> > > > > > > +		PRINT_ERR("Error with source mempool
> > > creation.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	dst_pool =
> > > rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > > > > > > +			nr_buf, /* n == num elements */
> > > > > > > +			64,  /* cache size */
> > > > > >
> > > > > > [Anoob] We do not alloc or free pointers in the datapath, right? So
> > > > > > why bother with cache?
> > > > >
> > > > > [Cheng] Yes, you are right, the cache size is not necessary here, I'll
> > > > > fix it in the next version.
> > > > >
> > > > > >
> > > > > > > +			0,   /* priv size */
> > > > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > > > +			cfg->dst_numa_node);
> > > > > > > +	if (dst_pool == NULL) {
> > > > > > > +		PRINT_ERR("Error with destination mempool
> > > creation.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> > > 0);
> > > > > > > +	if (*srcs == NULL) {
> > > > > > > +		printf("Error: srcs malloc failed.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > >
> > > > > > [Anoob] Are we freeing these memory? The ones allocated with
> > > > > rte_malloc.
> > > > >
> > > > > [Cheng] yes, we freed the memory in the end of
> > > mem_copy_benchmark()
> > > > > when we finished the test.
> > > >
> > > > [Anoob] I think we are not freeing this memory. In the place where we free
> > > > all memory, we do free all objects back to the mempools, as well as the
> > > > mempools themselves. But this memory holds the pointers, right? Is that
> > > > getting freed anywhere?
> > > >
> > > > Also, in the memory-clearing paths, do we need to clear the static
> > > > variables (i.e., set srcs, src_pool, dsts, dst_pool to NULL) so that there
> > > > is no scope for a double free?
> > > >
> > >
> > > [Cheng] My apologies for the misunderstanding earlier. I now understand
> > > your point, and you are right: the memory used to store the pointers is
> > > not being freed. I will fix this issue in the next version. Regarding the
> > > static variables you mentioned, I agree that they should be cleared, and
> > > I will address this in the upcoming version as well. Thank you very much
> > > for the feedback. It is greatly appreciated.
> > >
> > > In addition, I think we also need to nullify these variables when initializing
> > > them to ensure safety and standardization of use. What do you think?
> >
> > [Anoob] Since these are static variables, it is probably okay to skip the init
> > part. But when we use it, we should clear it after use.
> >
> > Please check above; I've posted one more comment, in case you missed it.
> >
> 
> [Cheng] sure, thanks for your advice, I'll clear it after use in the next version,
> thanks.
> 
> > >
> > > Thanks!
> > >
> > > > >
> > > > > >
> > > > > > > +
> > > > > > > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> > > 0);
> > > > > > > +	if (*dsts == NULL) {
> > > > > > > +		printf("Error: dsts malloc failed.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf)
> > > != 0) {
> > > > > > > +		printf("get src mbufs failed.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf)
> > > != 0) {
> > > > > > > +		printf("get dst mbufs failed.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
> {
> > > > > > > +	uint16_t i;
> > > > > > > +	uint32_t offset;
> > > > > > > +	unsigned int lcore_id = 0;
> > > > > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > > > > +1024) /
> > > > > > > (cfg->buf_size.cur * 2);
> > > > > > > +	uint16_t nb_workers = ldm->cnt;
> > > > > > > +	uint16_t test_secs = cfg->test_secs;
> > > > > > > +	float memory;
> > > > > > > +	uint32_t avg_cycles = 0;
> > > > > > > +	float mops;
> > > > > > > +	float bandwidth;
> > > > > > > +
> > > > > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > > > > +		goto out;
> > > > > > > +
> > > > > > > +	if (is_dma)
> > > > > > > +		if (config_dmadevs(cfg) < 0)
> > > > > > > +			goto out;
> > > > > > > +
> > > > > > > +	if (cfg->cache_flush) {
> > > > > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > > > > +		rte_mb();
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	printf("Start testing....\n");
> > > > > > > +
> > > > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > > > +		lcore_id = ldm->lcores[i];
> > > > > > > +		offset = nr_buf / nb_workers * i;
> > > > > > > +
> > > > > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > > > > lcore_params), 0);
> > > > > > > +		if (!worker_params[i]) {
> > > > > > > +			printf("lcore parameters malloc failure for
> > > lcore
> > > > > > > %d\n", lcore_id);
> > > > > > > +			break;
> > > > > > > +		}
> > > > > >
> > > > > > [Anoob] Are we freeing the above memory?
> > > > >
> > > > > [Cheng] sorry, I missed that, I'll add worker_params memory free in
> > > > > the next version, thanks.
> > > > >
> > > > > >
> > > > > > > +		if (is_dma) {
> > > > > > > +			worker_params[i]->dma_name = ldm-
> > > > > > > >dma_names[i];
> > > > > > > +			worker_params[i]->dev_id = ldm-
> > > >dma_ids[i];
> > > > > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > > > > +		}
> > > > > > > +		worker_params[i]->worker_id = i;
> > > > > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > > > > nb_workers);
> > > > > > > +		worker_params[i]->buf_size = buf_size;
> > > > > > > +		worker_params[i]->test_secs = test_secs;
> > > > > > > +		worker_params[i]->srcs = srcs + offset;
> > > > > > > +		worker_params[i]->dsts = dsts + offset;
> > > > > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > > > > +
> > > > > > > +		if (is_dma)
> > > > > > > +
> > > 	rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > > > > *)(&i), lcore_id);
> > > > > > > +		else
> > > > > > > +			rte_eal_remote_launch(do_cpu_mem_copy,
> > > (void
> > > > > > > *)(&i), lcore_id);
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	while (1) {
> > > > > > > +		bool ready = true;
> > > > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > > > +			if (worker_params[i]-
> > > >worker_info.ready_flag ==
> > > > > > > false) {
> > > > > > > +				ready = 0;
> > > > > > > +				break;
> > > > > > > +			}
> > > > > > > +		}
> > > > > > > +		if (ready)
> > > > > > > +			break;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > > > > +
> > > > > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > > > +worker_params[i]->worker_info.total_cpl;
> > > > > > > +
> > > > > > > +	usleep(test_secs * 1000 * 1000);
> > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > > > worker_params[i]->worker_info.total_cpl -
> > > > > > > +						worker_params[i]-
> > > > > > > >worker_info.test_cpl;
> > > > > > > +
> > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > > > > +
> > > > > > > +	rte_eal_mp_wait_lcore();
> > > > > > > +
> > > > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > > > > +			worker_params[i]->worker_info.test_cpl,
> > > > > > > +			&memory, &avg_cycles, &bandwidth,
> > > &mops);
> > > > > > > +		output_result(cfg->scenario_id, worker_params[i]-
> > > >lcore_id,
> > > > > > > +					worker_params[i]-
> > > >dma_name,
> > > > > > > avg_cycles, buf_size,
> > > > > > > +					nr_buf / nb_workers,
> > > memory,
> > > > > > > bandwidth, mops, is_dma);
> > > > > > > +	}
> > > > > > > +
> > > > > > > +out:
> > > > > > > +	/* free env */
> > > > > > > +	if (srcs)
> > > > > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > > > > +	if (dsts)
> > > > > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > > > > +
> > > > > > > +	if (src_pool)
> > > > > > > +		rte_mempool_free(src_pool);
> > > > > > > +	if (dst_pool)
> > > > > > > +		rte_mempool_free(dst_pool);
> > > > > > > +
> > > > > > > +	if (is_dma) {
> > > > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > > > +			printf("Stopping dmadev %d\n", ldm-
> > > >dma_ids[i]);
> > > > > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +}
> > > > > > > diff --git a/app/test-dma-perf/config.ini
> > > > > > > b/app/test-dma-perf/config.ini new file mode 100644 index
> > > > > > > 0000000000..2fd9c3c387
> > > > > > > --- /dev/null
> > > > > > > +++ b/app/test-dma-perf/config.ini
> > > > > > > @@ -0,0 +1,59 @@
> > > > > > > +
> > > > > > > +; This is an example configuration file for dma-perf, which
> > > > > > > +details the meanings of each parameter ; and instructions on
> how
> > > > > > > +to use dma-
> > > > > perf.
> > > > > > > +
> > > > > > > +; Supported test types are DMA_MEM_COPY and
> CPU_MEM_COPY.
> > > > > > > +
> > > > > > > +; Parameters:
> > > > > > > +; "mem_size" denotes the size of the memory footprint.
> > > > > > > +; "buf_size" denotes the memory size of a single operation.
> > > > > > > +; "dma_ring_size" denotes the dma ring buffer size. It should be
> > > > > > > +greater
> > > > > > > than 64 normally.
> > > > > > > +; "kick_batch" denotes the dma operation batch size, and should
> > > > > > > +be greater
> > > > > > > than 1 normally.
> > > > > > > +
> > > > > > > +; The format for variables is
> variable=first,last,increment,ADD|MUL.
> > > > > > > +
> > > > > > > +; src_numa_node is used to control the numa node where the
> > > source
> > > > > > > memory is allocated.
> > > > > > > +; dst_numa_node is used to control the numa node where the
> > > > > > > +destination
> > > > > > > memory is allocated.
> > > > > > > +
> > > > > > > +; cache_flush is used to determine whether or not the cache
> > > > > > > +should be flushed, with 1 indicating to ; flush and 0 indicating to
> > not
> > > > flush.
> > > > > > > +
> > > > > > > +; test_seconds controls the test time of the whole case.
> > > > > > > +
> > > > > > > +; To use DMA for a test, please specify the "lcore_dma"
> parameter.
> > > > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > > > +; make sure that the value of "lcore_dma" falls within their
> > > > > > > +range of the
> > > > > > > values.
> > > > > > > +
> > > > > > > +; To use CPU for a test, please specify the "lcore" parameter.
> > > > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > > > +; make sure that the value of "lcore" falls within their range of
> > > values.
> > > > > > > +
> > > > > > > +; To specify a configuration file, use the "--config" flag
> > > > > > > +followed by the path
> > > > > > > to the file.
> > > > > > > +
> > > > > > > +; To specify a result file, use the "--result" flag followed by
> > > > > > > +the path to the
> > > > > > > file.
> > > > > > > +; If you do not specify a result file, one will be generated with
> > > > > > > +the same name as the configuration ; file, with the addition of
> > > > > > > +"_result.csv" at
> > > > > > > the end.
> > > > > > > +
> > > > > > > +[case1]
> > > > > > > +type=DMA_MEM_COPY
> > > > > > > +mem_size=10
> > > > > > > +buf_size=64,8192,2,MUL
> > > > > > > +dma_ring_size=1024
> > > > > > > +kick_batch=32
> > > > > > > +src_numa_node=0
> > > > > > > +dst_numa_node=0
> > > > > > > +cache_flush=0
> > > > > > > +test_seconds=2
> > > > > > > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > > > > >
> > > > > > [Anoob] Isn't it better if we allow user to specify DMA dev ID
> > > > > > rather than the PCI DBDF?
> > > > > >
> > > > > > In the long run, I would expect config file to provide {core,
> > > > > > dma_dev_id, queue_id}
> > > > > >
> > > > > > > Another thought is why to expose this at all? If we can restrict
> > > > > > > this perf application to have one thread only use one vchan, then
> > > > > > > application can easily create this mapping in run time. Unless you
> > > > > > > want one thread to use 2 different vchans which may not be
> > > > > > > desirable since this is a standalone perf app.
> > > > >
> > > > > [Cheng] Thank you for the feedback.
> > > > > Here are my thoughts:
> > > > > > Firstly, the user may not know which device the DMA dev ID
> > > > > > corresponds to, or which NUMA node it is on. In my example, I used
> > > > > > the CBDMA environment, so I did not specify the work queue ID. When
> > > > > > using DSA, the configuration would be something like
> > > > > > lcore10@0000:00:04.2-q0 which contains core, dma and work queue id.
> > > > > > The reason for exposing these options is that we want the user to
> > > > > > fully understand which cores and devices are being used so that
> > > > > > they know exactly where the performance data is coming from. For
> > > > > > example, performance when cores and DMA devices are not on the same
> > > > > > NUMA node, etc. This allows the testing scenario to be precise and
> > > > > > flexible. If the application handles the mapping itself, the user
> > > > > > loses control over the mapping and may not get the performance data
> > > > > > they want. We believe control should be given to the user rather
> > > > > > than the application.
> > > >
> > > > [Anoob] I understand your viewpoints. Thanks for the explanation.
> > > >
> > >
> > > [Cheng] sure, no problem.
> > >
> > > > >
> > > > > >
> > > > > > > +eal_args=--in-memory --file-prefix=test
> > > > > > > +
> > > > > > > +[case2]
> > > > > > > +type=CPU_MEM_COPY
> > > > > > > +mem_size=10
> > > > > > > +buf_size=64,8192,2,MUL
> > > > > > > +src_numa_node=0
> > > > > > > +dst_numa_node=1
> > > > > > > +cache_flush=0
> > > > > > > +test_seconds=2
> > > > > > > +lcore = 3, 4
> > > > > > > +eal_args=--in-memory --no-pci
> > > > > > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-
> perf/main.c
> > > > > > > new file mode 100644 index 0000000000..d65655b87b
> > > > > > > --- /dev/null
> > > > > > > +++ b/app/test-dma-perf/main.c
> > > > > > > @@ -0,0 +1,569 @@
> > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > > +
> > > > > > > +#include <stdio.h>
> > > > > > > +#include <stdlib.h>
> > > > > > > +#include <getopt.h>
> > > > > > > +#include <signal.h>
> > > > > > > +#include <stdbool.h>
> > > > > > > +#include <unistd.h>
> > > > > > > +#include <sys/wait.h>
> > > > > > > +#include <inttypes.h>
> > > > > > > +#include <libgen.h>
> > > > > > > +
> > > > > > > +#include <rte_eal.h>
> > > > > > > +#include <rte_cfgfile.h>
> > > > > > > +#include <rte_string_fns.h>
> > > > > > > +#include <rte_lcore.h>
> > > > > > > +
> > > > > > > +#include "main.h"
> > > > > > > +
> > > > > > > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > > > > > > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > > > > > > +
> > > > > > > +#define MAX_EAL_PARAM_NB 100
> > > > > > > +#define MAX_EAL_PARAM_LEN 1024
> > > > > > > +
> > > > > > > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > > > > > > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > > > > > > +
> > > > > > > +#define CMDLINE_CONFIG_ARG "--config"
> > > > > > > +#define CMDLINE_RESULT_ARG "--result"
> > > > > > > +
> > > > > > > +#define MAX_PARAMS_PER_ENTRY 4
> > > > > > > +
> > > > > > > +#define MAX_LONG_OPT_SZ 64
> > > > > > > +
> > > > > > > +enum {
> > > > > > > +	TEST_TYPE_NONE = 0,
> > > > > > > +	TEST_TYPE_DMA_MEM_COPY,
> > > > > > > +	TEST_TYPE_CPU_MEM_COPY
> > > > > > > +};
> > > > > > > +
> > > > > > > +#define MAX_TEST_CASES 16
> > > > > > > +static struct test_configure test_cases[MAX_TEST_CASES];
> > > > > > > +
> > > > > > > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > > > +
> > > > > > > +static FILE *fd;
> > > > > > > +
> > > > > > > +static void
> > > > > > > +output_csv(bool need_blankline)
> > > > > > > +{
> > > > > > > +	uint32_t i;
> > > > > > > +
> > > > > > > +	if (need_blankline) {
> > > > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > > > > > > +		if (output_str[i][0]) {
> > > > > > > +			fprintf(fd, "%s", output_str[i]);
> > > > > > > +			output_str[i][0] = '\0';
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	fflush(fd);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +output_env_info(void)
> > > > > > > +{
> > > > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > > > > > > environment:\n");
> > > > > > > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU
> > > frequency,%"
> > > > > > > +			PRIu64 "\n", rte_get_timer_hz());
> > > > > > > +
> > > > > > > +	output_csv(true);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +output_header(uint32_t case_id, struct test_configure
> *case_cfg) {
> > > > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > > > > > > +			CSV_HDR_FMT, case_id, case_cfg-
> > > >test_type_str);
> > > > > > > +
> > > > > > > +	output_csv(true);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +run_test_case(struct test_configure *case_cfg) {
> > > > > > > +	switch (case_cfg->test_type) {
> > > > > > > +	case TEST_TYPE_DMA_MEM_COPY:
> > > > > > > +		mem_copy_benchmark(case_cfg, true);
> > > > > > > +		break;
> > > > > > > +	case TEST_TYPE_CPU_MEM_COPY:
> > > > > > > +		mem_copy_benchmark(case_cfg, false);
> > > > > > > +		break;
> > > > > > > +	default:
> > > > > > > +		printf("Unknown test type. %s\n", case_cfg-
> > > >test_type_str);
> > > > > > > +		break;
> > > > > > > +	}
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > > > > > > +	uint32_t i;
> > > > > > > +	uint32_t nb_lcores = rte_lcore_count();
> > > > > > > +	struct test_configure_entry *mem_size = &case_cfg-
> > > >mem_size;
> > > > > > > +	struct test_configure_entry *buf_size = &case_cfg-
> > > >buf_size;
> > > > > > > +	struct test_configure_entry *ring_size = &case_cfg-
> > > >ring_size;
> > > > > > > +	struct test_configure_entry *kick_batch = &case_cfg-
> > > >kick_batch;
> > > > > > > +	struct test_configure_entry dummy = { 0 };
> > > > > > > +	struct test_configure_entry *var_entry = &dummy;
> > > > > > > +
> > > > > > > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > > > > > > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > > > > > > +
> > > > > > > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > > > > > > +		printf("Case %u: Not enough lcores.\n", case_id);
> > > > > > > +		return;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > > > > > > +
> > > > > > > +	if (mem_size->incr != 0)
> > > > > > > +		var_entry = mem_size;
> > > > > > > +
> > > > > > > +	if (buf_size->incr != 0)
> > > > > > > +		var_entry = buf_size;
> > > > > > > +
> > > > > > > +	if (ring_size->incr != 0)
> > > > > > > +		var_entry = ring_size;
> > > > > > > +
> > > > > > > +	if (kick_batch->incr != 0)
> > > > > > > +		var_entry = kick_batch;
> > > > > > > +
> > > > > > > +	case_cfg->scenario_id = 0;
> > > > > > > +
> > > > > > > +	output_header(case_id, case_cfg);
> > > > > > > +
> > > > > > > +	for (var_entry->cur = var_entry->first; var_entry->cur <=
> > > > > > > +var_entry-
> > > > > > > >last;) {
> > > > > > > +		case_cfg->scenario_id++;
> > > > > > > +		printf("\nRunning scenario %d\n", case_cfg-
> > > >scenario_id);
> > > > > > > +
> > > > > > > +		run_test_case(case_cfg);
> > > > > > > +		output_csv(false);
> > > > > > > +
> > > > > > > +		if (var_entry->op == OP_ADD)
> > > > > > > +			var_entry->cur += var_entry->incr;
> > > > > > > +		else if (var_entry->op == OP_MUL)
> > > > > > > +			var_entry->cur *= var_entry->incr;
> > > > > > > +		else
> > > > > > > +			break;
> > > > > > > +	}
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +parse_lcore(struct test_configure *test_case, const char *value)
> {
> > > > > > > +	size_t len = strlen(value);
> > > > > > > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > > > > > > +	strcpy(input, value);
> > > > > > > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > > > > > > >lcore_dma_map);
> > > > > > > +
> > > > > > > +	if (test_case == NULL || value == NULL)
> > > > > > > +		return -1;
> > > > > > > +
> > > > > > > +	memset(lcore_dma_map, 0, sizeof(struct
> > > lcore_dma_map_t));
> > > > > > > +
> > > > > > > +	char *token = strtok(input, ", ");
> > > > > > > +	while (token != NULL) {
> > > > > > > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > > > > > > +			free(input);
> > > > > > > +			return -1;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		uint16_t lcore_id = atoi(token);
> > > > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] =
> > > lcore_id;
> > > > > > > +
> > > > > > > +		token = strtok(NULL, ", ");
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	free(input);
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +parse_lcore_dma(struct test_configure *test_case, const char
> > > *value)
> > > > {
> > > > > > > +	struct lcore_dma_map_t *lcore_dma_map;
> > > > > > > +	char *input = strndup(value, strlen(value) + 1);
> > > > > > > +	char *addrs = input;
> > > > > > > +	char *ptrs[2];
> > > > > > > +	char *start, *end, *substr;
> > > > > > > +	uint16_t lcore_id;
> > > > > > > +	int ret = 0;
> > > > > > > +
> > > > > > > +	while (*addrs == '\0')
> > > > > > > +		addrs++;
> > > > > > > +	if (*addrs == '\0') {
> > > > > > > +		fprintf(stderr, "No input DMA addresses\n");
> > > > > > > +		ret = -1;
> > > > > > > +		goto out;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	substr = strtok(addrs, ",");
> > > > > > > +	if (substr == NULL) {
> > > > > > > +		fprintf(stderr, "No input DMA address\n");
> > > > > > > +		ret = -1;
> > > > > > > +		goto out;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > > > > > > lcore_dma_map_t));
> > > > > > > +
> > > > > > > +	do {
> > > > > > > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > > > > > > +
> > > > > > > +		start = strstr(ptrs[0], "lcore");
> > > > > > > +		if (start == NULL) {
> > > > > > > +			fprintf(stderr, "Illegal lcore\n");
> > > > > > > +			ret = -1;
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		start += 5;
> > > > > > > +		lcore_id = strtol(start, &end, 0);
> > > > > > > +		if (end == start) {
> > > > > > > +			fprintf(stderr, "No input lcore ID or ID %d is
> > > > > > > wrong\n", lcore_id);
> > > > > > > +			ret = -1;
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		lcore_dma_map = &test_case->lcore_dma_map;
> > > > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt] =
> > > lcore_id;
> > > > > > > +		strcpy(lcore_dma_map-
> > > >dma_names[lcore_dma_map-
> > > > > > > >cnt], ptrs[1]);
> > > > > > > +		lcore_dma_map->cnt++;
> > > > > > > +		substr = strtok(NULL, ",");
> > > > > > > +	} while (substr != NULL);
> > > > > > > +
> > > > > > > +out:
> > > > > > > +	free(input);
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +parse_entry(const char *value, struct test_configure_entry
> *entry)
> > > {
> > > > > > > +	char input[255] = {0};
> > > > > > > +	char *args[MAX_PARAMS_PER_ENTRY];
> > > > > > > +	int args_nr = -1;
> > > > > > > +
> > > > > > > +	if (value == NULL || entry == NULL)
> > > > > > > +		goto out;
> > > > > > > +
> > > > > > > +	strncpy(input, value, 254);
> > > > > > > +	if (*input == '\0')
> > > > > > > +		goto out;
> > > > > > > +
> > > > > > > +	args_nr = rte_strsplit(input, strlen(input), args,
> > > > > > > MAX_PARAMS_PER_ENTRY, ',');
> > > > > > > +	if (args_nr != 1 && args_nr != 4)
> > > > > > > +		goto out;
> > > > > > > +
> > > > > > > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > > > > > > +
> > > > > > > +	if (args_nr == 4) {
> > > > > > > +		entry->last = (uint32_t)atoi(args[1]);
> > > > > > > +		entry->incr = (uint32_t)atoi(args[2]);
> > > > > > > +		if (!strcmp(args[3], "MUL"))
> > > > > > > +			entry->op = OP_MUL;
> > > > > > > +		else if (!strcmp(args[3], "ADD"))
> > > > > > > +			entry->op = OP_ADD;
> > > > > > > +		else {
> > > > > > > +			printf("Invalid op %s.\n", args[3]);
> > > > > > > +			args_nr = -1;
> > > > > > > +		}
> > > > > > > +	} else {
> > > > > > > +		entry->op = OP_NONE;
> > > > > > > +		entry->last = 0;
> > > > > > > +		entry->incr = 0;
> > > > > > > +	}
> > > > > > > +out:
> > > > > > > +	return args_nr;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static uint16_t
> > > > > > > +load_configs(const char *path)
> > > > > > > +{
> > > > > > > +	struct rte_cfgfile *cfgfile;
> > > > > > > +	int nb_sections, i;
> > > > > > > +	struct test_configure *test_case;
> > > > > > > +	char section_name[CFG_NAME_LEN];
> > > > > > > +	const char *case_type;
> > > > > > > +	const char *lcore_dma;
> > > > > > > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > > > > > > *kick_batch_str;
> > > > > > > +	int args_nr, nb_vp;
> > > > > > > +	bool is_dma;
> > > > > > > +
> > > > > > > +	printf("config file parsing...\n");
> > > > > > > +	cfgfile = rte_cfgfile_load(path, 0);
> > > > > > > +	if (!cfgfile) {
> > > > > > > +		printf("Open configure file error.\n");
> > > > > > > +		exit(1);
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > > > > > > +	if (nb_sections > MAX_TEST_CASES) {
> > > > > > > +		printf("Error: The maximum number of cases is
> > > %d.\n",
> > > > > > > MAX_TEST_CASES);
> > > > > > > +		exit(1);
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	for (i = 0; i < nb_sections; i++) {
> > > > > > > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i
> > > + 1);
> > > > > > > +		test_case = &test_cases[i];
> > > > > > > +		case_type = rte_cfgfile_get_entry(cfgfile,
> > > section_name,
> > > > > > > "type");
> > > > > > > +		if (!case_type) {
> > > > > > > +			printf("Error: No case type in case %d, the
> > > test will be
> > > > > > > finished here.\n",
> > > > > > > +				i + 1);
> > > > > > > +			test_case->is_valid = false;
> > > > > > > +			continue;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > > > > > > +			test_case->test_type =
> > > > > > > TEST_TYPE_DMA_MEM_COPY;
> > > > > > > +			test_case->test_type_str =
> > > DMA_MEM_COPY;
> > > > > > > +			is_dma = true;
> > > > > > > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > > > > > > +			test_case->test_type =
> > > > > > > TEST_TYPE_CPU_MEM_COPY;
> > > > > > > +			test_case->test_type_str =
> > > CPU_MEM_COPY;
> > > > > > > +			is_dma = false;
> > > > > > > +		} else {
> > > > > > > +			printf("Error: Cannot find case type %s in
> > > case%d.\n",
> > > > > > > case_type, i + 1);
> > > > > > > +			test_case->is_valid = false;
> > > > > > > +			continue;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		nb_vp = 0;
> > > > > > > +
> > > > > > > +		test_case->src_numa_node =
> > > > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > > +
> > > > > > > 	section_name, "src_numa_node"));
> > > > > > > +		test_case->dst_numa_node =
> > > > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > > +
> > > > > > > 	section_name, "dst_numa_node"));
> > > > > > > +
> > > > > > > +		mem_size_str = rte_cfgfile_get_entry(cfgfile,
> > > section_name,
> > > > > > > "mem_size");
> > > > > > > +		args_nr = parse_entry(mem_size_str, &test_case-
> > > > > > > >mem_size);
> > > > > > > +		if (args_nr < 0) {
> > > > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > > > +			test_case->is_valid = false;
> > > > > > > +			continue;
> > > > > > > +		} else if (args_nr > 1)
> > > > > > > +			nb_vp++;
> > > > > > > +
> > > > > > > +		buf_size_str = rte_cfgfile_get_entry(cfgfile,
> > > section_name,
> > > > > > > "buf_size");
> > > > > > > +		args_nr = parse_entry(buf_size_str, &test_case-
> > > >buf_size);
> > > > > > > +		if (args_nr < 0) {
> > > > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > > > +			test_case->is_valid = false;
> > > > > > > +			continue;
> > > > > > > +		} else if (args_nr > 1)
> > > > > > > +			nb_vp++;
> > > > > > > +
> > > > > > > +		if (is_dma) {
> > > > > > > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > > > > section_name,
> > > > > > > +
> > > > > > > 	"dma_ring_size");
> > > > > > > +			args_nr = parse_entry(ring_size_str,
> > > &test_case-
> > > > > > > >ring_size);
> > > > > > > +			if (args_nr < 0) {
> > > > > > > +				printf("parse error in case %d.\n", i +
> > > 1);
> > > > > > > +				test_case->is_valid = false;
> > > > > > > +				continue;
> > > > > > > +			} else if (args_nr > 1)
> > > > > > > +				nb_vp++;
> > > > > > > +
> > > > > > > +			kick_batch_str =
> > > rte_cfgfile_get_entry(cfgfile,
> > > > > > > section_name, "kick_batch");
> > > > > > > +			args_nr = parse_entry(kick_batch_str,
> > > &test_case-
> > > > > > > >kick_batch);
> > > > > > > +			if (args_nr < 0) {
> > > > > > > +				printf("parse error in case %d.\n", i +
> > > 1);
> > > > > > > +				test_case->is_valid = false;
> > > > > > > +				continue;
> > > > > > > +			} else if (args_nr > 1)
> > > > > > > +				nb_vp++;
> > > > > > > +
> > > > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > > > section_name, "lcore_dma");
> > > > > > > +			int lcore_ret = parse_lcore_dma(test_case,
> > > > > > > lcore_dma);
> > > > > > > +			if (lcore_ret < 0) {
> > > > > > > +				printf("parse lcore dma error in case
> > > %d.\n", i
> > > > > > 1);
> > > > > > > +				test_case->is_valid = false;
> > > > > > > +				continue;
> > > > > > > +			}
> > > > > > > +		} else {
> > > > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > > > section_name, "lcore");
> > > > > > > +			int lcore_ret = parse_lcore(test_case,
> > > lcore_dma);
> > > > > > > +			if (lcore_ret < 0) {
> > > > > > > +				printf("parse lcore error in case
> > > %d.\n", i + 1);
> > > > > > > +				test_case->is_valid = false;
> > > > > > > +				continue;
> > > > > > > +			}
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		if (nb_vp > 1) {
> > > > > > > +			printf("Error, each section can only have a
> > > single
> > > > > > > variable parameter.\n");
> > > > > > > +			test_case->is_valid = false;
> > > > > > > +			continue;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		test_case->cache_flush =
> > > > > > > +			(int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > section_name,
> > > > > > > "cache_flush"));
> > > > > > > +		test_case->test_secs =
> > > > > > > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > > +					section_name,
> > > "test_seconds"));
> > > > > > > +
> > > > > > > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > > > > > > section_name, "eal_args");
> > > > > > > +		test_case->is_valid = true;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	rte_cfgfile_close(cfgfile);
> > > > > > > +	printf("config file parsing complete.\n\n");
> > > > > > > +	return i;
> > > > > > > +}
> > > > > > > +
> > > > > > > +/* Parse the argument given in the command line of the
> > > > > > > +application */ static int append_eal_args(int argc, char **argv,
> > > > > > > +const char *eal_args, char **new_argv) {
> > > > > > > +	int i;
> > > > > > > +	char *tokens[MAX_EAL_PARAM_NB];
> > > > > > > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > > > > > > +	int token_nb, new_argc = 0;
> > > > > > > +
> > > > > > > +	for (i = 0; i < argc; i++) {
> > > > > > > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > > > > > > +				(strcmp(argv[i],
> > > CMDLINE_RESULT_ARG) ==
> > > > > > > 0)) {
> > > > > > > +			i++;
> > > > > > > +			continue;
> > > > > > > +		}
> > > > > > > +		strlcpy(new_argv[new_argc], argv[i],
> > > > > > > sizeof(new_argv[new_argc]));
> > > > > > > +		new_argc++;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	if (eal_args) {
> > > > > > > +		strlcpy(args, eal_args, sizeof(args));
> > > > > > > +		token_nb = rte_strsplit(args, strlen(args),
> > > > > > > +					tokens,
> > > MAX_EAL_PARAM_NB, ' ');
> > > > > > > +		for (i = 0; i < token_nb; i++)
> > > > > > > +			strcpy(new_argv[new_argc++], tokens[i]);
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	return new_argc;
> > > > > > > +}
> > > > > > > +
> > > > > > > +int
> > > > > > > +main(int argc, char *argv[])
> > > > > > > +{
> > > > > > > +	int ret;
> > > > > > > +	uint16_t case_nb;
> > > > > > > +	uint32_t i, nb_lcores;
> > > > > > > +	pid_t cpid, wpid;
> > > > > > > +	int wstatus;
> > > > > > > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > > > > > > +	char *pargs[MAX_EAL_PARAM_NB];
> > > > > > > +	char *cfg_path_ptr = NULL;
> > > > > > > +	char *rst_path_ptr = NULL;
> > > > > > > +	char rst_path[PATH_MAX];
> > > > > > > +	int new_argc;
> > > > > > > +	bool is_first_case = true;
> > > > > > > +
> > > > > > > +	memset(args, 0, sizeof(args));
> > > > > > > +
> > > > > > > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > > > > > > +		pargs[i] = args[i];
> > > > > > > +
> > > > > > > +	for (i = 0; i < (uint32_t)argc; i++) {
> > > > > > > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > > > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > > > +			cfg_path_ptr = argv[i + 1];
> > > > > > > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > > > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > > > +			rst_path_ptr = argv[i + 1];
> > > > > > > +	}
> > > > > > > +	if (cfg_path_ptr == NULL) {
> > > > > > > +		printf("Config file not assigned.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +	if (rst_path_ptr == NULL) {
> > > > > > > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > > > > > > +		strcat(strtok(basename(rst_path), "."),
> > > "_result.csv");
> > > > > > > +		rst_path_ptr = rst_path;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	case_nb = load_configs(cfg_path_ptr);
> > > > > > > +	fd = fopen(rst_path_ptr, "w");
> > > > > > > +	if (fd == NULL) {
> > > > > > > +		printf("Open output CSV file error.\n");
> > > > > > > +		return -1;
> > > > > > > +	}
> > > > > > > +	fclose(fd);
> > > > > > > +
> > > > > > > +	for (i = 0; i < case_nb; i++) {
> > > > > > > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > > > > > > +			printf("No test type in test case %d.\n\n", i +
> > > 1);
> > > > > > > +			continue;
> > > > > > > +		}
> > > > > > > +		if (!test_cases[i].is_valid) {
> > > > > > > +			printf("Invalid test case %d.\n\n", i + 1);
> > > > > > > +			continue;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		cpid = fork();
> > > > > >
> > > > > > [Anoob] Do we really need fork()? Can't we use code like,
> > > > > >
> > > > > > 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> > > > > > 			ret |= rte_eal_wait_lcore(lcore_id);
> > > > > > 		}
> > > > > >
> > > > > > to wait for all threads to exit?
> > > > >
> > > > > > [Cheng] Good question. Fork() is used here to establish a new
> > > > > > process for the new test case. In order for each test case to have
> > > > > > a new EAL environment (for the flexibility), the EAL must be
> > > > > > reinitialized for each case. However, the EAL parameters can only
> > > > > > be initialized once per process. Therefore, we use a new process to
> > > > > > run each new test case. Moreover, each test case runs sequentially
> > > > > > and does not affect the others, ensuring the accuracy of the
> > > > > > performance data. Your code would wait for all threads to exit in
> > > > > > the same process. However, it would not provide a "clean"
> > > > > > environment for each test case like fork() does. Fork() allows us
> > > > > > to have a fully reinitialized environment, with no impact or side
> > > > > > effects from previous test cases. This results in clean, precise
> > > > > > performance data for each case.
> > > > >
> > > > > Please let me know your thoughts on this. And please let me know if
> > > > > you have any other questions or require any clarification.
> > > >
> > > > [Anoob] This was just a generic observation. I do not have a strong
> > > > opinion either way.
> > > >
> > >
> > > [Cheng] sure, got it.
> > >
> > > > >
> > > > > Thanks,
> > > > > Cheng
> > > > >
> > > > > >
> > > > > > > +		if (cpid < 0) {
> > > > > > > +			printf("Fork case %d failed.\n", i + 1);
> > > > > > > +			exit(EXIT_FAILURE);
> > > > > > > +		} else if (cpid == 0) {
> > > > > > > +			printf("\nRunning case %u\n\n", i + 1);
> > > > > > > +
> > > > > > > +			new_argc = append_eal_args(argc, argv,
> > > > > > > test_cases[i].eal_args, pargs);
> > > > > > > +			ret = rte_eal_init(new_argc, pargs);
> > > > > > > +			if (ret < 0)
> > > > > > > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > > > > > > arguments\n");
> > > > > > > +
> > > > > > > +			/* Check lcores. */
> > > > > > > +			nb_lcores = rte_lcore_count();
> > > > > > > +			if (nb_lcores < 2)
> > > > > > > +				rte_exit(EXIT_FAILURE,
> > > > > > > +					"There should be at least 2
> > > worker
> > > > > > > lcores.\n");
> > > > > > > +
> > > > > > > +			fd = fopen(rst_path_ptr, "a");
> > > > > > > +			if (!fd) {
> > > > > > > +				printf("Open output CSV file
> > > error.\n");
> > > > > > > +				return 0;
> > > > > > > +			}
> > > > > > > +
> > > > > > > +			if (is_first_case) {
> > > > > > > +				output_env_info();
> > > > > > > +				is_first_case = false;
> > > > > > > +			}
> > > > > > > +			run_test(i + 1, &test_cases[i]);
> > > > > > > +
> > > > > > > +			/* clean up the EAL */
> > > > > > > +			rte_eal_cleanup();
> > > > > > > +
> > > > > > > +			fclose(fd);
> > > > > > > +
> > > > > > > +			printf("\nCase %u completed.\n\n", i + 1);
> > > > > > > +
> > > > > > > +			exit(EXIT_SUCCESS);
> > > > > > > +		} else {
> > > > > > > +			wpid = waitpid(cpid, &wstatus, 0);
> > > > > > > +			if (wpid == -1) {
> > > > > > > +				printf("waitpid error.\n");
> > > > > > > +				exit(EXIT_FAILURE);
> > > > > > > +			}
> > > > > > > +
> > > > > > > +			if (WIFEXITED(wstatus))
> > > > > > > +				printf("Case process exited. status
> > > %d\n\n",
> > > > > > > +					WEXITSTATUS(wstatus));
> > > > > > > +			else if (WIFSIGNALED(wstatus))
> > > > > > > +				printf("Case process killed by signal
> > > %d\n\n",
> > > > > > > +					WTERMSIG(wstatus));
> > > > > > > +			else if (WIFSTOPPED(wstatus))
> > > > > > > +				printf("Case process stopped by
> > > signal
> > > > > > > %d\n\n",
> > > > > > > +					WSTOPSIG(wstatus));
> > > > > > > +			else if (WIFCONTINUED(wstatus))
> > > > > > > +				printf("Case process
> > > continued.\n\n");
> > > > > > > +			else
> > > > > > > +				printf("Case process unknown
> > > > > > > terminated.\n\n");
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	printf("Bye...\n");
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > diff --git a/app/test-dma-perf/main.h b/app/test-dma-
> perf/main.h
> > > > > > > new file mode 100644 index 0000000000..215ac42673
> > > > > > > --- /dev/null
> > > > > > > +++ b/app/test-dma-perf/main.h
> > > > > > > @@ -0,0 +1,69 @@
> > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > > +
> > > > > > > +#ifndef _MAIN_H_
> > > > > > > +#define _MAIN_H_
> > > > > > > +
> > > > > > > +
> > > > > > > +#include <rte_common.h>
> > > > > > > +#include <rte_cycles.h>
> > > > > > > +#include <rte_dev.h>
> > > > > > > +#include <rte_dmadev.h>
> > > > > > > +
> > > > > > > +#ifndef __maybe_unused
> > > > > > > +#define __maybe_unused	__rte_unused
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +#define MAX_WORKER_NB 128
> > > > > > > +#define MAX_OUTPUT_STR_LEN 512
> > > > > > > +
> > > > > > > +#define MAX_DMA_NB 128
> > > > > > > +#define MAX_LCORE_NB 256
> > > > > > > +
> > > > > > > +extern char
> > > output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > > > +
> > > > > > > +typedef enum {
> > > > > > > +	OP_NONE = 0,
> > > > > > > +	OP_ADD,
> > > > > > > +	OP_MUL
> > > > > > > +} alg_op_type;
> > > > > > > +
> > > > > > > +struct test_configure_entry {
> > > > > > > +	uint32_t first;
> > > > > > > +	uint32_t last;
> > > > > > > +	uint32_t incr;
> > > > > > > +	alg_op_type op;
> > > > > > > +	uint32_t cur;
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct lcore_dma_map_t {
> > > > > > > +	uint32_t lcores[MAX_WORKER_NB];
> > > > > > > +	char
> > > dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > > > > > > +	int16_t dma_ids[MAX_WORKER_NB];
> > > > > > > +	uint16_t cnt;
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct test_configure {
> > > > > > > +	bool is_valid;
> > > > > > > +	uint8_t test_type;
> > > > > > > +	const char *test_type_str;
> > > > > > > +	uint16_t src_numa_node;
> > > > > > > +	uint16_t dst_numa_node;
> > > > > > > +	uint16_t opcode;
> > > > > > > +	bool is_dma;
> > > > > > > +	struct lcore_dma_map_t lcore_dma_map;
> > > > > > > +	struct test_configure_entry mem_size;
> > > > > > > +	struct test_configure_entry buf_size;
> > > > > > > +	struct test_configure_entry ring_size;
> > > > > > > +	struct test_configure_entry kick_batch;
> > > > > > > +	uint32_t cache_flush;
> > > > > > > +	uint32_t nr_buf;
> > > > > > > +	uint16_t test_secs;
> > > > > > > +	const char *eal_args;
> > > > > > > +	uint8_t scenario_id;
> > > > > > > +};
> > > > > > > +
> > > > > > > +void mem_copy_benchmark(struct test_configure *cfg, bool
> > > is_dma);
> > > > > > > +
> > > > > > > +#endif /* _MAIN_H_ */
> > > > > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > > > > perf/meson.build new file mode 100644 index
> > > 0000000000..bd6c264002
> > > > > > > --- /dev/null
> > > > > > > +++ b/app/test-dma-perf/meson.build
> > > > > > > @@ -0,0 +1,17 @@
> > > > > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > > > > > +Intel Corporation
> > > > > > > +
> > > > > > > +# meson file, for building this app as part of a main DPDK build.
> > > > > > > +
> > > > > > > +if is_windows
> > > > > > > +    build = false
> > > > > > > +    reason = 'not supported on Windows'
> > > > > > > +    subdir_done()
> > > > > > > +endif
> > > > > > > +
> > > > > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > > > > +
> > > > > > > +sources = files(
> > > > > > > +        'main.c',
> > > > > > > +        'benchmark.c',
> > > > > > > +)
> > > > > > > --
> > > > > > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16  6:32               ` Anoob Joseph
@ 2023-06-16  8:43                 ` Jiang, Cheng1
  2023-06-16  9:48                   ` Anoob Joseph
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-16  8:43 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Friday, June 16, 2023 2:32 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Please see inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Friday, June 16, 2023 8:26 AM
> > To: Anoob Joseph <anoobj@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> application
> >
> > Hi Anoob,
> >
> > Replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Thursday, June 15, 2023 11:48 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> > >
> > > Hi Cheng,
> > >
> > > Please see inline.
> > >
> > > Thanks,
> > > Anoob
> > >
> > > > -----Original Message-----
> > > > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > Sent: Thursday, June 15, 2023 7:36 PM
> > > > To: Anoob Joseph <anoobj@marvell.com>
> > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> > Chenbo
> > > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > > <amitprakashs@marvell.com>; Ma, WenwuX
> <wenwux.ma@intel.com>;
> > > > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > > <xingguang.he@intel.com>
> > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > > >
> > > > Hi Anoob,
> > > >
> > > > Replies are inline.
> > > >
> > > > Thanks,
> > > > Cheng
> > > >
> > > > > -----Original Message-----
> > > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > > Sent: Thursday, June 15, 2023 4:45 PM
> > > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> > > Chenbo
> > > > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > > > <amitprakashs@marvell.com>; Ma, WenwuX
> > <wenwux.ma@intel.com>;
> > > > > Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > > > <xingguang.he@intel.com>
> > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > application
> > > > >
> > > > > Hi Cheng,
> > > > >
> > > > > Please see inline.
> > > > >
> > > > > Thanks,
> > > > > Anoob
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > > > Sent: Thursday, June 15, 2023 1:31 PM
> > > > > > To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> > > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > > mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>;
> > > > Amit
> > > > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > Wang,
> > > > > YuanX
> > > > > > <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> > > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > > application
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > Thanks for your comments, the replies are inline.
> > > > > >
> > > > > > Thanks,
> > > > > > Cheng
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > > > > Sent: Thursday, June 15, 2023 1:22 PM
> > > > > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>;
> > thomas@monjalon.net;
> > > > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > > > mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>;
> > > > > Amit
> > > > > > > Prakash Shukla <amitprakashs@marvell.com>
> > > > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > > Wang,
> > > > > > YuanX
> > > > > > > <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > > > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > > > application
> > > > > > >
> > > > > > > Hi,
> > > > > > >
> > > > > > > Thanks for working on the comments. Few more top level
> comment
> > > > > inline.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Anoob
> > > > > > >
> > > > > > > > -----Original Message-----
> > > > > > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > > > Sent: Tuesday, June 13, 2023 10:02 AM
> > > > > > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > > > > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit
> > Prakash
> > > > > > Shukla
> > > > > > > > <amitprakashs@marvell.com>; Anoob Joseph
> > > > <anoobj@marvell.com>
> > > > > > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > > > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > > > > > xingguang.he@intel.com;
> > > > > > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > > > Subject: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > > > > application
> > > > > > > >
> > > > > > > > External Email
> > > > > > > >
> > > > > > > > ------------------------------------------------------------------
> > > > > > > > --
> > > > > > > > -- There are many high-performance DMA devices supported in
> > > DPDK
> > > > > > > > now, and these DMA devices can also be integrated into other
> > > > > > > > modules of DPDK as accelerators, such as Vhost. Before
> > integrating
> > > > > > > > DMA into applications, developers need to know the
> performance
> > > of
> > > > > > > > these DMA devices in various scenarios and the performance of
> > > CPUs
> > > > > > > > in the same scenario, such as different buffer lengths. Only in
> > > > > > > > this way can we know the target performance of the application
> > > > > > > > accelerated by using them. This patch introduces a
> > > > > > > > high-performance testing tool, which supports comparing the
> > > > > > > > performance of CPU and DMA in different scenarios
> automatically
> > > > > > > > with a pre- set config file. Memory Copy performance test are
> > > > > > > supported for now.
> > > > > > > >
> > > > > > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > > > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > > > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > > > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > > > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > > > > > ---
> > > > > > > > v6:
> > > > > > > >   improved code based on Anoob's comments;
> > > > > > > >   fixed some code structure issues;
> > > > > > > > v5:
> > > > > > > >   fixed some LONG_LINE warnings;
> > > > > > > > v4:
> > > > > > > >   fixed inaccuracy of the memory footprint display;
> > > > > > > > v3:
> > > > > > > >   fixed some typos;
> > > > > > > > v2:
> > > > > > > >   added lcore/dmadev designation;
> > > > > > > >   added error case process;
> > > > > > > >   removed worker_threads parameter from config.ini;
> > > > > > > >   improved the logs;
> > > > > > > >   improved config file;
> > > > > > > >
> > > > > > > >  app/meson.build               |   1 +
> > > > > > > >  app/test-dma-perf/benchmark.c | 477
> > > > > > ++++++++++++++++++++++++++++
> > > > > > > > app/test-dma-perf/config.ini  |  59 ++++
> > > > > > > >  app/test-dma-perf/main.c      | 569
> > > > > > > > ++++++++++++++++++++++++++++++++++
> > > > > > > >  app/test-dma-perf/main.h      |  69 +++++
> > > > > > > >  app/test-dma-perf/meson.build |  17 +
> > > > > > > >  6 files changed, 1192 insertions(+)  create mode 100644
> > > > > > > > app/test-dma-perf/benchmark.c  create mode 100644
> > > > > > > > app/test-dma-perf/config.ini  create mode 100644 app/test-
> dma-
> > > > > > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h
> > create
> > > > > > > > mode
> > > > > > > > 100644 app/test-dma-perf/meson.build
> > > > > > > >
> > > > > > > > diff --git a/app/meson.build b/app/meson.build index
> > > > > > > > 74d2420f67..4fc1a83eba 100644
> > > > > > > > --- a/app/meson.build
> > > > > > > > +++ b/app/meson.build
> > > > > > > > @@ -19,6 +19,7 @@ apps = [
> > > > > > > >          'test-cmdline',
> > > > > > > >          'test-compress-perf',
> > > > > > > >          'test-crypto-perf',
> > > > > > > > +        'test-dma-perf',
> > > > > > > >          'test-eventdev',
> > > > > > > >          'test-fib',
> > > > > > > >          'test-flow-perf',
> > > > > > > > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> > > > > > > > perf/benchmark.c new file mode 100644 index
> > > > 0000000000..bc1ca82297
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/app/test-dma-perf/benchmark.c
> > > > > > > > @@ -0,0 +1,477 @@
> > > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > > > +
> > > > > > > > +#include <inttypes.h>
> > > > > > > > +#include <stdio.h>
> > > > > > > > +#include <stdlib.h>
> > > > > > > > +#include <unistd.h>
> > > > > > > > +
> > > > > > > > +#include <rte_time.h>
> > > > > > > > +#include <rte_mbuf.h>
> > > > > > > > +#include <rte_dmadev.h>
> > > > > > > > +#include <rte_malloc.h>
> > > > > > > > +#include <rte_lcore.h>
> > > > > > > > +
> > > > > > > > +#include "main.h"
> > > > > > > > +
> > > > > > > > +#define MAX_DMA_CPL_NB 255
> > > > > > > > +
> > > > > > > > +#define TEST_WAIT_U_SECOND 10000
> > > > > > > > +
> > > > > > > > +#define CSV_LINE_DMA_FMT "Scenario
> > %u,%u,%s,%u,%u,%.2lf,%"
> > > > > > PRIu64
> > > > > > > > ",%.3lf,%.3lf\n"
> > > > > > > > +#define CSV_LINE_CPU_FMT "Scenario
> > %u,%u,NA,%u,%u,%.2lf,%"
> > > > > > PRIu64
> > > > > > > > ",%.3lf,%.3lf\n"
> > > > > > > > +
> > > > > > > > +struct worker_info {
> > > > > > > > +	bool ready_flag;
> > > > > > > > +	bool start_flag;
> > > > > > > > +	bool stop_flag;
> > > > > > > > +	uint32_t total_cpl;
> > > > > > > > +	uint32_t test_cpl;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +struct lcore_params {
> > > > > > > > +	uint8_t scenario_id;
> > > > > > > > +	unsigned int lcore_id;
> > > > > > > > +	char *dma_name;
> > > > > > > > +	uint16_t worker_id;
> > > > > > > > +	uint16_t dev_id;
> > > > > > > > +	uint32_t nr_buf;
> > > > > > > > +	uint16_t kick_batch;
> > > > > > > > +	uint32_t buf_size;
> > > > > > > > +	uint16_t test_secs;
> > > > > > > > +	struct rte_mbuf **srcs;
> > > > > > > > +	struct rte_mbuf **dsts;
> > > > > > > > +	struct worker_info worker_info;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +static struct rte_mempool *src_pool; static struct
> rte_mempool
> > > > > > > > +*dst_pool;
> > > > > > > > +
> > > > > > > > +static volatile struct lcore_params
> > > > > > *worker_params[MAX_WORKER_NB];
> > > > > > > > +
> > > > > > > > +#define PRINT_ERR(...) print_err(__func__, __LINE__,
> > > > __VA_ARGS__)
> > > > > > > > +
> > > > > > > > +static inline int
> > > > > > > > +__rte_format_printf(3, 4)
> > > > > > > > +print_err(const char *func, int lineno, const char *format, ...) {
> > > > > > > > +	va_list ap;
> > > > > > > > +	int ret;
> > > > > > > > +
> > > > > > > > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > > > > > > > +	va_start(ap, format);
> > > > > > > > +	ret += vfprintf(stderr, format, ap);
> > > > > > > > +	va_end(ap);
> > > > > > > > +
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static inline void
> > > > > > > > +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t
> > > > > > > > +nb_workers,
> > > > > > > > uint16_t test_secs,
> > > > > > > > +				uint32_t total_cnt, float *memory,
> > > > uint32_t
> > > > > > > > *ave_cycle,
> > > > > > > > +				float *bandwidth, float *mops) {
> > > > > > > > +	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) /
> > > > (1024
> > > > > > > > +*
> > > > > > > > 1024);
> > > > > > > > +	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
> > > > > > > > +	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() /
> > > > > > > > (float)*ave_cycle)) / 1000000000;
> > >
> > > [Anoob] The above calculation may not yield accurate results. 'ave_cycle'
> > > would get truncated to an integer, so bandwidth could then take only
> > > a few distinct values. Instead, we can do the calculation directly, like:
> > >
> > > 	*bandwidth = ((float)buf_size * 8 * total_cnt / test_secs) /
> > > 1000000000;
> > > 	*mops = (float)total_cnt / test_secs / 1000000;
> > >
> > > Same issue is there with below calculation as well. Please check.
> >
> > [Cheng] Yes, I've noticed as well. Dengdui also mentioned this in his
> > comments. I will address this issue in v7. Thank you very much.
> >
> > >
> > > Side note: in the bandwidth calculation, shouldn't we be dividing by
> > > 1024*1024*1024? I've just carried over the calculation that you used.
> > > Feel free to correct as required.
> >
> > [Cheng] The unit I'm using in my calculations is Gb/s (Gigabits per second),
> > which is based on the decimal system. Therefore, I use the factor of 1000^3
> > (or 1,000,000,000).
> > The method you mentioned, dividing by 1024^3, is typically used when
> > calculating Gib/s (gibibits per second), a binary-based unit.
> > I think both methods are acceptable as long as the units and calculation
> > methods correspond.
> > What do you think?
> 
> [Anoob] You are right. Existing logic is correct. Thanks for the explanation.

[Cheng] sure, no problem. Feel free to contact me if you have any more questions.

> 
> >
> > >
> > > > > > > > +	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000; }
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +output_result(uint8_t scenario_id, uint32_t lcore_id, char
> > > > > > > > +*dma_name,
> > > > > > > > uint64_t ave_cycle,
> > > > > > > > +			uint32_t buf_size, uint32_t nr_buf, float
> > > > memory,
> > > > > > > > +			float bandwidth, float mops, bool is_dma) {
> > > > > > > > +	if (is_dma)
> > > > > > > > +		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
> > > > > > > > +	else
> > > > > > > > +		printf("lcore %u\n", lcore_id);
> > > > > > > > +
> > > > > > > > +	printf("average cycles/op: %" PRIu64 ", buffer size: %u,
> > > > nr_buf:
> > > > > > > > +%u,
> > > > > > > > memory: %.2lfMB, frequency: %" PRIu64 ".\n",
> > > > > > > > +			ave_cycle, buf_size, nr_buf, memory,
> > > > > > > > rte_get_timer_hz());
> > > > > > > > +	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n",
> > > > bandwidth,
> > > > > > > > +mops);
> > > > > > > > +
> > > > > > > > +	if (is_dma)
> > > > > > > > +		snprintf(output_str[lcore_id],
> > > > MAX_OUTPUT_STR_LEN,
> > > > > > > > CSV_LINE_DMA_FMT,
> > > > > > > > +			scenario_id, lcore_id, dma_name, buf_size,
> > > > > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> > > > mops);
> > > > > > > > +	else
> > > > > > > > +		snprintf(output_str[lcore_id],
> > > > MAX_OUTPUT_STR_LEN,
> > > > > > > > CSV_LINE_CPU_FMT,
> > > > > > > > +			scenario_id, lcore_id, buf_size,
> > > > > > > > +			nr_buf, memory, ave_cycle, bandwidth,
> > > > mops); }
> > > > > > > > +
> > > > > > > > +static inline void
> > > > > > > > +cache_flush_buf(__maybe_unused struct rte_mbuf **array,
> > > > > > > > +		__maybe_unused uint32_t buf_size,
> > > > > > > > +		__maybe_unused uint32_t nr_buf) { #ifdef
> > > > > RTE_ARCH_X86_64
> > > > > > > > +	char *data;
> > > > > > > > +	struct rte_mbuf **srcs = array;
> > > > > > > > +	uint32_t i, offset;
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < nr_buf; i++) {
> > > > > > > > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > > > > > > > +		for (offset = 0; offset < buf_size; offset += 64)
> > > > > > > > +			__builtin_ia32_clflush(data + offset);
> > > > > > > > +	}
> > > > > > > > +#endif
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +/* Configuration of device. */
> > > > > > > > +static void
> > > > > > > > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
> {
> > > > > > > > +	uint16_t vchan = 0;
> > > > > > > > +	struct rte_dma_info info;
> > > > > > > > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > > > > > > > +	struct rte_dma_vchan_conf qconf = {
> > > > > > > > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > > > > > > > +		.nb_desc = ring_size
> > > > > > > > +	};
> > > > > > > > +
> > > > > > > > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > > > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma
> > > > configure.\n");
> > > > > > > > +
> > > > > > > > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > > > > > > > +		rte_exit(EXIT_FAILURE, "Error with queue
> > > > configuration.\n");
> > > > > > > > +
> > > > > > > > +	rte_dma_info_get(dev_id, &info);
> > > > > > > > +	if (info.nb_vchans != 1)
> > > > > > > > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > > > > > > > reported on device id. %u\n",
> > > > > > > > +				dev_id);
> > > > > > > > +
> > > > > > > > +	if (rte_dma_start(dev_id) != 0)
> > > > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > > > > > > > +
> > > > > > > > +static int
> > > > > > > > +config_dmadevs(struct test_configure *cfg) {
> > > > > > > > +	uint32_t ring_size = cfg->ring_size.cur;
> > > > > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > > > > +	uint32_t nb_workers = ldm->cnt;
> > > > > > > > +	uint32_t i;
> > > > > > > > +	int dev_id;
> > > > > > > > +	uint16_t nb_dmadevs = 0;
> > > > > > > > +	char *dma_name;
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < ldm->cnt; i++) {
> > > > > > > > +		dma_name = ldm->dma_names[i];
> > > > > > > > +		dev_id =
> > > > rte_dma_get_dev_id_by_name(dma_name);
> > > > > > > > +		if (dev_id == -1) {
> > > > > > > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > > > > > > dma_name);
> > > > > > > > +			goto end;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		ldm->dma_ids[i] = dev_id;
> > > > > > > > +		configure_dmadev_queue(dev_id, ring_size);
> > > > > > > > +		++nb_dmadevs;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +end:
> > > > > > > > +	if (nb_dmadevs < nb_workers) {
> > > > > > > > +		printf("Not enough dmadevs (%u) for all workers
> > > > (%u).\n",
> > > > > > > > nb_dmadevs, nb_workers);
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > > > > > > +
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +#define POLL_MAX 1000
> > > > > > > > +
> > > > > > > > +
> > > > > > >
> > > > > > > [Anoob] Extra blank line. You can consider removing.
> > > > > >
> > > > > > [Cheng] sure, sorry for the miss.
> > > > > >
> > > > > > >
> > > > > > > > +static inline void
> > > > > > > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t
> > *async_cnt,
> > > > > > > > +			volatile struct worker_info *worker_info) {
> > > > > > > > +	int ret;
> > > > > > > > +	uint16_t nr_cpl;
> > > > > > > > +
> > > > > > > > +	ret = rte_dma_submit(dev_id, 0);
> > > > > > > > +	if (ret < 0) {
> > > > > > > > +		rte_dma_stop(dev_id);
> > > > > > > > +		rte_dma_close(dev_id);
> > > > > > > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	nr_cpl = rte_dma_completed(dev_id, 0,
> > > > MAX_DMA_CPL_NB, NULL,
> > > > > > > > NULL);
> > > > > > > > +	*async_cnt -= nr_cpl;
> > > > > > > > +	worker_info->total_cpl += nr_cpl; }
> > > > > > > > +
> > > > > > > > +static inline int
> > > > > > > > +do_dma_mem_copy(void *p)
> > > > > > > > +{
> > > > > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > > > > +	volatile struct lcore_params *para =
> > > > worker_params[*para_idx];
> > > > > > > > +	volatile struct worker_info *worker_info = &(para-
> > > > >worker_info);
> > > > > > > > +	const uint16_t dev_id = para->dev_id;
> > > > > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > > > > +	const uint16_t kick_batch = para->kick_batch;
> > > > > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > > > > +	uint16_t nr_cpl;
> > > > > > > > +	uint64_t async_cnt = 0;
> > > > > > > > +	uint32_t i;
> > > > > > > > +	uint32_t poll_cnt = 0;
> > > > > > > > +	int ret;
> > > > > > > > +
> > > > > > > > +	worker_info->stop_flag = false;
> > > > > > > > +	worker_info->ready_flag = true;
> > > > > > > > +
> > > > > > > > +	while (!worker_info->start_flag)
> > > > > > > > +		;
> > > > > > > > +
> > > > > > > > +	while (1) {
> > > > > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > > > > +dma_copy:
> > > > > > > > +			ret = rte_dma_copy(dev_id, 0,
> > > > > > > > rte_pktmbuf_iova(srcs[i]),
> > > > > > > > +				rte_pktmbuf_iova(dsts[i]), buf_size,
> > > > 0);
> > > > > > > > +			if (unlikely(ret < 0)) {
> > > > > > > > +				if (ret == -ENOSPC) {
> > > > > > > > +
> > > > 	do_dma_submit_and_poll(dev_id,
> > > > > > > > &async_cnt, worker_info);
> > > > > > > > +					goto dma_copy;
> > > > > > > > +				} else {
> > > > > > > > +					/* Error exit */
> > > > > > > > +					rte_dma_stop(dev_id);
> > > > > > > > +					rte_exit(EXIT_FAILURE,
> > > > "DMA
> > > > > > > > enqueue failed\n");
> > > > > > > > +				}
> > > > > > > > +			}
> > > > > > > > +			async_cnt++;
> > > > > > > > +
> > > > > > > > +			if ((async_cnt % kick_batch) == 0)
> > > > > > > > +				do_dma_submit_and_poll(dev_id,
> > > > > > > > &async_cnt, worker_info);
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		if (worker_info->stop_flag)
> > > > > > > > +			break;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	rte_dma_submit(dev_id, 0);
> > > > > > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > > > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > > > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > > > > > +		async_cnt -= nr_cpl;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static inline int
> > > > > > > > +do_cpu_mem_copy(void *p)
> > > > > > > > +{
> > > > > > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > > > > > +	volatile struct lcore_params *para =
> > > > worker_params[*para_idx];
> > > > > > > > +	volatile struct worker_info *worker_info = &(para-
> > > > >worker_info);
> > > > > > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > > > > > +	const uint32_t buf_size = para->buf_size;
> > > > > > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > > > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > > > > > +	uint32_t i;
> > > > > > > > +
> > > > > > > > +	worker_info->stop_flag = false;
> > > > > > > > +	worker_info->ready_flag = true;
> > > > > > > > +
> > > > > > > > +	while (!worker_info->start_flag)
> > > > > > > > +		;
> > > > > > > > +
> > > > > > > > +	while (1) {
> > > > > > > > +		for (i = 0; i < nr_buf; i++) {
> > > > > > > > +			/* copy buffer form src to dst */
> > > > > > > > +			rte_memcpy((void
> > > > > > > > *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
> > > > > > > > +				(void
> > > > > > > > *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
> > > > > > > > +				(size_t)buf_size);
> > > > > > > > +			worker_info->total_cpl++;
> > > > > > > > +		}
> > > > > > > > +		if (worker_info->stop_flag)
> > > > > > > > +			break;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int
> > > > > > > > +setup_memory_env(struct test_configure *cfg, struct
> rte_mbuf
> > > > > > ***srcs,
> > > > > > > > +			struct rte_mbuf ***dsts)
> > > > > > > > +{
> > > > > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > > > > +	unsigned int nr_sockets;
> > > > > > > > +	uint32_t nr_buf = cfg->nr_buf;
> > > > > > > > +
> > > > > > > > +	nr_sockets = rte_socket_count();
> > > > > > > > +	if (cfg->src_numa_node >= nr_sockets ||
> > > > > > > > +		cfg->dst_numa_node >= nr_sockets) {
> > > > > > > > +		printf("Error: Source or destination numa exceeds
> > > > the acture
> > > > > > > > numa nodes.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	src_pool =
> > > > rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > > > > > > > +			nr_buf, /* n == num elements */
> > > > > > > > +			64,  /* cache size */
> > > > > > > > +			0,   /* priv size */
> > > > > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > > > > +			cfg->src_numa_node);
> > > > > > > > +	if (src_pool == NULL) {
> > > > > > > > +		PRINT_ERR("Error with source mempool
> > > > creation.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	dst_pool =
> > > > rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > > > > > > > +			nr_buf, /* n == num elements */
> > > > > > > > +			64,  /* cache size */
> > > > > > >
> > > > > > > [Anoob] We do not alloc or free pointers in the datapath, right? So
> > > > > > > why bother with cache?
> > > > > >
> > > > > > [Cheng] Yes, you are right, the cache size is not necessary here, I'll
> > > > > > fix it in the next version.
> > > > > >
> > > > > > >
> > > > > > > > +			0,   /* priv size */
> > > > > > > > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > > > > > > > +			cfg->dst_numa_node);
> > > > > > > > +	if (dst_pool == NULL) {
> > > > > > > > +		PRINT_ERR("Error with destination mempool
> > > > creation.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> > > > 0);
> > > > > > > > +	if (*srcs == NULL) {
> > > > > > > > +		printf("Error: srcs malloc failed.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > >
> > > > > > > [Anoob] Are we freeing these memory? The ones allocated with
> > > > > > rte_malloc.
> > > > > >
> > > > > > [Cheng] yes, we freed the memory in the end of
> > > > mem_copy_benchmark()
> > > > > > when we finished the test.
> > > > >
> > > > > [Anoob] I think we are not freeing this memory. In the place where we
> > > > > free everything, we return all objects to the mempools and free the
> > > > > mempools themselves. But this memory holds the pointers, right? Is
> > > > > that getting freed anywhere?
> > > > >
> > > > > Also, in the memory-clearing paths, do we need to clear the static
> > > > > variables (i.e., set srcs, src_pool, dsts, dst_pool to NULL) so that
> > > > > there is no scope for a double free?
> > > > >
> > > >
> > > > [Cheng] My apologies for the misunderstanding earlier. I now see your
> > > > point, and you are right: the memory used to store the pointers is not
> > > > being freed. I will fix this issue in the next version. Regarding the
> > > > static variables you mentioned, I agree that they should be cleared.
> > > > I will address this in the upcoming version as well. Thank you very much
> > > > for the feedback. It is greatly appreciated.
> > > >
> > > > In addition, I think we also need to nullify these variables when
> initializing
> > > > them to ensure safety and standardization of use. What do you think?
> > >
> > > [Anoob] Since these are static variables, it is probably okay to skip the init
> > > part. But when we use it, we should clear it after use.
> > >
> > > Please check above. I've posted one more comment. In case you missed.
> > >
> >
> > [Cheng] sure, thanks for your advice, I'll clear it after use in the next version,
> > thanks.
> >
> > > >
> > > > Thanks!
> > > >
> > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *),
> > > > 0);
> > > > > > > > +	if (*dsts == NULL) {
> > > > > > > > +		printf("Error: dsts malloc failed.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf)
> > > > != 0) {
> > > > > > > > +		printf("get src mbufs failed.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf)
> > > > != 0) {
> > > > > > > > +		printf("get dst mbufs failed.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +mem_copy_benchmark(struct test_configure *cfg, bool
> is_dma)
> > {
> > > > > > > > +	uint16_t i;
> > > > > > > > +	uint32_t offset;
> > > > > > > > +	unsigned int lcore_id = 0;
> > > > > > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > > > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > > > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > > > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > > > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > > > > > +1024) /
> > > > > > > > (cfg->buf_size.cur * 2);
> > > > > > > > +	uint16_t nb_workers = ldm->cnt;
> > > > > > > > +	uint16_t test_secs = cfg->test_secs;
> > > > > > > > +	float memory;
> > > > > > > > +	uint32_t avg_cycles = 0;
> > > > > > > > +	float mops;
> > > > > > > > +	float bandwidth;
> > > > > > > > +
> > > > > > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	if (is_dma)
> > > > > > > > +		if (config_dmadevs(cfg) < 0)
> > > > > > > > +			goto out;
> > > > > > > > +
> > > > > > > > +	if (cfg->cache_flush) {
> > > > > > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > > > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > > > > > +		rte_mb();
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	printf("Start testing....\n");
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > > > > +		lcore_id = ldm->lcores[i];
> > > > > > > > +		offset = nr_buf / nb_workers * i;
> > > > > > > > +
> > > > > > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > > > > > lcore_params), 0);
> > > > > > > > +		if (!worker_params[i]) {
> > > > > > > > +			printf("lcore parameters malloc failure for
> > > > lcore
> > > > > > > > %d\n", lcore_id);
> > > > > > > > +			break;
> > > > > > > > +		}
> > > > > > >
> > > > > > > [Anoob] Are we freeing the above memory?
> > > > > >
> > > > > > [Cheng] Sorry, I missed that. I'll free the worker_params memory in
> > > > > > the next version, thanks.
> > > > > >
> > > > > > >
> > > > > > > > +		if (is_dma) {
> > > > > > > > +			worker_params[i]->dma_name = ldm-
> > > > > > > > >dma_names[i];
> > > > > > > > +			worker_params[i]->dev_id = ldm-
> > > > >dma_ids[i];
> > > > > > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > > > > > +		}
> > > > > > > > +		worker_params[i]->worker_id = i;
> > > > > > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > > > > > nb_workers);
> > > > > > > > +		worker_params[i]->buf_size = buf_size;
> > > > > > > > +		worker_params[i]->test_secs = test_secs;
> > > > > > > > +		worker_params[i]->srcs = srcs + offset;
> > > > > > > > +		worker_params[i]->dsts = dsts + offset;
> > > > > > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > > > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > > > > > +
> > > > > > > > +		if (is_dma)
> > > > > > > > +
> > > > 	rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > > > > > *)(&i), lcore_id);
> > > > > > > > +		else
> > > > > > > > +			rte_eal_remote_launch(do_cpu_mem_copy,
> > > > (void
> > > > > > > > *)(&i), lcore_id);
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	while (1) {
> > > > > > > > +		bool ready = true;
> > > > > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > > > > +			if (worker_params[i]-
> > > > >worker_info.ready_flag ==
> > > > > > > > false) {
> > > > > > > > +				ready = 0;
> > > > > > > > +				break;
> > > > > > > > +			}
> > > > > > > > +		}
> > > > > > > > +		if (ready)
> > > > > > > > +			break;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > > > > > +
> > > > > > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > > > > +worker_params[i]->worker_info.total_cpl;
> > > > > > > > +
> > > > > > > > +	usleep(test_secs * 1000 * 1000);
> > > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > > +		worker_params[i]->worker_info.test_cpl =
> > > > > > > > worker_params[i]->worker_info.total_cpl -
> > > > > > > > +						worker_params[i]-
> > > > > > > > >worker_info.test_cpl;
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < nb_workers; i++)
> > > > > > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > > > > > +
> > > > > > > > +	rte_eal_mp_wait_lcore();
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < nb_workers; i++) {
> > > > > > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > > > > > +			worker_params[i]->worker_info.test_cpl,
> > > > > > > > +			&memory, &avg_cycles, &bandwidth,
> > > > &mops);
> > > > > > > > +		output_result(cfg->scenario_id, worker_params[i]-
> > > > >lcore_id,
> > > > > > > > +					worker_params[i]-
> > > > >dma_name,
> > > > > > > > avg_cycles, buf_size,
> > > > > > > > +					nr_buf / nb_workers,
> > > > memory,
> > > > > > > > bandwidth, mops, is_dma);
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +out:
> > > > > > > > +	/* free env */
> > > > > > > > +	if (srcs)
> > > > > > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > > > > > +	if (dsts)
> > > > > > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > > > > > +
> > > > > > > > +	if (src_pool)
> > > > > > > > +		rte_mempool_free(src_pool);
> > > > > > > > +	if (dst_pool)
> > > > > > > > +		rte_mempool_free(dst_pool);
> > > > > > > > +
> > > > > > > > +	if (is_dma) {
> > > > > > > > +		for (i = 0; i < nb_workers; i++) {
> > > > > > > > +			printf("Stopping dmadev %d\n", ldm-
> > > > >dma_ids[i]);
> > > > > > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > > > > > > > +		}
> > > > > > > > +	}
> > > > > > > > +}
> > > > > > > > diff --git a/app/test-dma-perf/config.ini
> > > > > > > > b/app/test-dma-perf/config.ini new file mode 100644 index
> > > > > > > > 0000000000..2fd9c3c387
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/app/test-dma-perf/config.ini
> > > > > > > > @@ -0,0 +1,59 @@
> > > > > > > > +
> > > > > > > > +; This is an example configuration file for dma-perf, which
> > > > > > > > +details the meanings of each parameter ; and instructions on
> > how
> > > > > > > > +to use dma-
> > > > > > perf.
> > > > > > > > +
> > > > > > > > +; Supported test types are DMA_MEM_COPY and
> > CPU_MEM_COPY.
> > > > > > > > +
> > > > > > > > +; Parameters:
> > > > > > > > +; "mem_size" denotes the size of the memory footprint.
> > > > > > > > +; "buf_size" denotes the memory size of a single operation.
> > > > > > > > +; "dma_ring_size" denotes the dma ring buffer size. It should
> be
> > > > > > > > +greater
> > > > > > > > than 64 normally.
> > > > > > > > +; "kick_batch" denotes the dma operation batch size, and
> should
> > > > > > > > +be greater
> > > > > > > > than 1 normally.
> > > > > > > > +
> > > > > > > > +; The format for variables is
> > variable=first,last,increment,ADD|MUL.
> > > > > > > > +
> > > > > > > > +; src_numa_node is used to control the numa node where the
> > > > source
> > > > > > > > memory is allocated.
> > > > > > > > +; dst_numa_node is used to control the numa node where the
> > > > > > > > +destination
> > > > > > > > memory is allocated.
> > > > > > > > +
> > > > > > > > +; cache_flush is used to determine whether or not the cache
> > > > > > > > +should be flushed, with 1 indicating to ; flush and 0 indicating to
> > > not
> > > > > flush.
> > > > > > > > +
> > > > > > > > +; test_seconds controls the test time of the whole case.
> > > > > > > > +
> > > > > > > > +; To use DMA for a test, please specify the "lcore_dma"
> > parameter.
> > > > > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > > > > +; make sure that the value of "lcore_dma" falls within their
> > > > > > > > +range of the
> > > > > > > > values.
> > > > > > > > +
> > > > > > > > +; To use CPU for a test, please specify the "lcore" parameter.
> > > > > > > > +; If you have already set the "-l" and "-a" parameters using EAL,
> > > > > > > > +; make sure that the value of "lcore" falls within their range of
> > > > values.
> > > > > > > > +
> > > > > > > > +; To specify a configuration file, use the "--config" flag
> > > > > > > > +followed by the path
> > > > > > > > to the file.
> > > > > > > > +
> > > > > > > > +; To specify a result file, use the "--result" flag followed by
> > > > > > > > +the path to the
> > > > > > > > file.
> > > > > > > > +; If you do not specify a result file, one will be generated with
> > > > > > > > +the same name as the configuration ; file, with the addition of
> > > > > > > > +"_result.csv" at
> > > > > > > > the end.
> > > > > > > > +
> > > > > > > > +[case1]
> > > > > > > > +type=DMA_MEM_COPY
> > > > > > > > +mem_size=10
> > > > > > > > +buf_size=64,8192,2,MUL
> > > > > > > > +dma_ring_size=1024
> > > > > > > > +kick_batch=32
> > > > > > > > +src_numa_node=0
> > > > > > > > +dst_numa_node=0
> > > > > > > > +cache_flush=0
> > > > > > > > +test_seconds=2
> > > > > > > > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > > > > > >
> > > > > > > [Anoob] Isn't it better if we allow user to specify DMA dev ID
> > > > > > > rather than the PCI DBDF?
> > > > > > >
> > > > > > > In the long run, I would expect config file to provide {core,
> > > > > > > dma_dev_id, queue_id}
> > > > > > >
> > > > > > > Another thought: why expose this at all? If we can restrict
> > > > > > > this perf application so that each thread uses only one vchan, then
> > > > > > > the application can easily create this mapping at run time, unless you
> > > > > > > want one thread to use 2 different vchans, which may not be desirable
> > > > > > > since this is a standalone perf app.
> > > > > >
> > > > > > [Cheng] Thank you for the feedback.
> > > > > > Here are my thoughts:
> > > > > > Firstly, the user may not know which device the DMA dev ID
> > > > > > corresponds to, or which NUMA node it is on. In my example, I used
> > > > > > the CBDMA environment, so I did not specify the work queue ID. When
> > > > > > using DSA, the configuration would be something like
> > > > > > lcore10@0000:00:04.2-q0, which contains the core, DMA device and work
> > > > > > queue ID. The reason for exposing these options is that we want the
> > > > > > user to fully understand which cores and devices are being used, so
> > > > > > that they know exactly where the performance data is coming from (for
> > > > > > example, the performance when cores and DMA devices are not on the
> > > > > > same NUMA node). This allows the testing scenario to be precise and
> > > > > > flexible. If the application handles the mapping itself, the user
> > > > > > loses control over the mapping and may not get the performance data
> > > > > > they want. We believe control should be given to the user rather than
> > > > > > the application.
> > > > >
> > > > > [Anoob] I understand your viewpoints. Thanks for the explanation.
> > > > >
> > > >
> > > > [Cheng] sure, no problem.
> > > >
> > > > > >
> > > > > > >
> > > > > > > > +eal_args=--in-memory --file-prefix=test
> > > > > > > > +
> > > > > > > > +[case2]
> > > > > > > > +type=CPU_MEM_COPY
> > > > > > > > +mem_size=10
> > > > > > > > +buf_size=64,8192,2,MUL
> > > > > > > > +src_numa_node=0
> > > > > > > > +dst_numa_node=1
> > > > > > > > +cache_flush=0
> > > > > > > > +test_seconds=2
> > > > > > > > +lcore = 3, 4
> > > > > > > > +eal_args=--in-memory --no-pci
> > > > > > > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-
> > perf/main.c
> > > > > > > > new file mode 100644 index 0000000000..d65655b87b
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/app/test-dma-perf/main.c
> > > > > > > > @@ -0,0 +1,569 @@
> > > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > > > +
> > > > > > > > +#include <stdio.h>
> > > > > > > > +#include <stdlib.h>
> > > > > > > > +#include <getopt.h>
> > > > > > > > +#include <signal.h>
> > > > > > > > +#include <stdbool.h>
> > > > > > > > +#include <unistd.h>
> > > > > > > > +#include <sys/wait.h>
> > > > > > > > +#include <inttypes.h>
> > > > > > > > +#include <libgen.h>
> > > > > > > > +
> > > > > > > > +#include <rte_eal.h>
> > > > > > > > +#include <rte_cfgfile.h>
> > > > > > > > +#include <rte_string_fns.h>
> > > > > > > > +#include <rte_lcore.h>
> > > > > > > > +
> > > > > > > > +#include "main.h"
> > > > > > > > +
> > > > > > > > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> > > > > > > > size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
> > > > > > > > +
> > > > > > > > +#define MAX_EAL_PARAM_NB 100
> > > > > > > > +#define MAX_EAL_PARAM_LEN 1024
> > > > > > > > +
> > > > > > > > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > > > > > > > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > > > > > > > +
> > > > > > > > +#define CMDLINE_CONFIG_ARG "--config"
> > > > > > > > +#define CMDLINE_RESULT_ARG "--result"
> > > > > > > > +
> > > > > > > > +#define MAX_PARAMS_PER_ENTRY 4
> > > > > > > > +
> > > > > > > > +#define MAX_LONG_OPT_SZ 64
> > > > > > > > +
> > > > > > > > +enum {
> > > > > > > > +	TEST_TYPE_NONE = 0,
> > > > > > > > +	TEST_TYPE_DMA_MEM_COPY,
> > > > > > > > +	TEST_TYPE_CPU_MEM_COPY
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +#define MAX_TEST_CASES 16
> > > > > > > > +static struct test_configure test_cases[MAX_TEST_CASES];
> > > > > > > > +
> > > > > > > > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > > > > +
> > > > > > > > +static FILE *fd;
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +output_csv(bool need_blankline)
> > > > > > > > +{
> > > > > > > > +	uint32_t i;
> > > > > > > > +
> > > > > > > > +	if (need_blankline) {
> > > > > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > > > > +		fprintf(fd, ",,,,,,,,\n");
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > > > > > > > +		if (output_str[i][0]) {
> > > > > > > > +			fprintf(fd, "%s", output_str[i]);
> > > > > > > > +			output_str[i][0] = '\0';
> > > > > > > > +		}
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	fflush(fd);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +output_env_info(void)
> > > > > > > > +{
> > > > > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test
> > > > > > > > environment:\n");
> > > > > > > > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU
> > > > frequency,%"
> > > > > > > > +			PRIu64 "\n", rte_get_timer_hz());
> > > > > > > > +
> > > > > > > > +	output_csv(true);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +output_header(uint32_t case_id, struct test_configure
> > *case_cfg) {
> > > > > > > > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > > > > > > > +			CSV_HDR_FMT, case_id, case_cfg-
> > > > >test_type_str);
> > > > > > > > +
> > > > > > > > +	output_csv(true);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +run_test_case(struct test_configure *case_cfg) {
> > > > > > > > +	switch (case_cfg->test_type) {
> > > > > > > > +	case TEST_TYPE_DMA_MEM_COPY:
> > > > > > > > +		mem_copy_benchmark(case_cfg, true);
> > > > > > > > +		break;
> > > > > > > > +	case TEST_TYPE_CPU_MEM_COPY:
> > > > > > > > +		mem_copy_benchmark(case_cfg, false);
> > > > > > > > +		break;
> > > > > > > > +	default:
> > > > > > > > +		printf("Unknown test type. %s\n", case_cfg-
> > > > >test_type_str);
> > > > > > > > +		break;
> > > > > > > > +	}
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void
> > > > > > > > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > > > > > > > +	uint32_t i;
> > > > > > > > +	uint32_t nb_lcores = rte_lcore_count();
> > > > > > > > +	struct test_configure_entry *mem_size = &case_cfg-
> > > > >mem_size;
> > > > > > > > +	struct test_configure_entry *buf_size = &case_cfg-
> > > > >buf_size;
> > > > > > > > +	struct test_configure_entry *ring_size = &case_cfg-
> > > > >ring_size;
> > > > > > > > +	struct test_configure_entry *kick_batch = &case_cfg-
> > > > >kick_batch;
> > > > > > > > +	struct test_configure_entry dummy = { 0 };
> > > > > > > > +	struct test_configure_entry *var_entry = &dummy;
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > > > > > > > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > > > > > > > +
> > > > > > > > +	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
> > > > > > > > +		printf("Case %u: Not enough lcores.\n", case_id);
> > > > > > > > +		return;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	printf("Number of used lcores: %u.\n", nb_lcores);
> > > > > > > > +
> > > > > > > > +	if (mem_size->incr != 0)
> > > > > > > > +		var_entry = mem_size;
> > > > > > > > +
> > > > > > > > +	if (buf_size->incr != 0)
> > > > > > > > +		var_entry = buf_size;
> > > > > > > > +
> > > > > > > > +	if (ring_size->incr != 0)
> > > > > > > > +		var_entry = ring_size;
> > > > > > > > +
> > > > > > > > +	if (kick_batch->incr != 0)
> > > > > > > > +		var_entry = kick_batch;
> > > > > > > > +
> > > > > > > > +	case_cfg->scenario_id = 0;
> > > > > > > > +
> > > > > > > > +	output_header(case_id, case_cfg);
> > > > > > > > +
> > > > > > > > +	for (var_entry->cur = var_entry->first; var_entry->cur <=
> > > > > > > > +var_entry-
> > > > > > > > >last;) {
> > > > > > > > +		case_cfg->scenario_id++;
> > > > > > > > +		printf("\nRunning scenario %d\n", case_cfg-
> > > > >scenario_id);
> > > > > > > > +
> > > > > > > > +		run_test_case(case_cfg);
> > > > > > > > +		output_csv(false);
> > > > > > > > +
> > > > > > > > +		if (var_entry->op == OP_ADD)
> > > > > > > > +			var_entry->cur += var_entry->incr;
> > > > > > > > +		else if (var_entry->op == OP_MUL)
> > > > > > > > +			var_entry->cur *= var_entry->incr;
> > > > > > > > +		else
> > > > > > > > +			break;
> > > > > > > > +	}
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int
> > > > > > > > +parse_lcore(struct test_configure *test_case, const char
> *value)
> > {
> > > > > > > > +	size_t len = strlen(value);
> > > > > > > > +	char *input = (char *) malloc((len + 1) * sizeof(char));
> > > > > > > > +	strcpy(input, value);
> > > > > > > > +	struct lcore_dma_map_t *lcore_dma_map = &(test_case-
> > > > > > > > >lcore_dma_map);
> > > > > > > > +
> > > > > > > > +	if (test_case == NULL || value == NULL)
> > > > > > > > +		return -1;
> > > > > > > > +
> > > > > > > > +	memset(lcore_dma_map, 0, sizeof(struct
> > > > lcore_dma_map_t));
> > > > > > > > +
> > > > > > > > +	char *token = strtok(input, ", ");
> > > > > > > > +	while (token != NULL) {
> > > > > > > > +		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
> > > > > > > > +			free(input);
> > > > > > > > +			return -1;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		uint16_t lcore_id = atoi(token);
> > > > > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt++] =
> > > > lcore_id;
> > > > > > > > +
> > > > > > > > +		token = strtok(NULL, ", ");
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	free(input);
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int
> > > > > > > > +parse_lcore_dma(struct test_configure *test_case, const char
> > > > *value)
> > > > > {
> > > > > > > > +	struct lcore_dma_map_t *lcore_dma_map;
> > > > > > > > +	char *input = strndup(value, strlen(value) + 1);
> > > > > > > > +	char *addrs = input;
> > > > > > > > +	char *ptrs[2];
> > > > > > > > +	char *start, *end, *substr;
> > > > > > > > +	uint16_t lcore_id;
> > > > > > > > +	int ret = 0;
> > > > > > > > +
> > > > > > > > +	while (*addrs == '\0')
> > > > > > > > +		addrs++;
> > > > > > > > +	if (*addrs == '\0') {
> > > > > > > > +		fprintf(stderr, "No input DMA addresses\n");
> > > > > > > > +		ret = -1;
> > > > > > > > +		goto out;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	substr = strtok(addrs, ",");
> > > > > > > > +	if (substr == NULL) {
> > > > > > > > +		fprintf(stderr, "No input DMA address\n");
> > > > > > > > +		ret = -1;
> > > > > > > > +		goto out;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	memset(&test_case->lcore_dma_map, 0, sizeof(struct
> > > > > > > > lcore_dma_map_t));
> > > > > > > > +
> > > > > > > > +	do {
> > > > > > > > +		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> > > > > > > > +
> > > > > > > > +		start = strstr(ptrs[0], "lcore");
> > > > > > > > +		if (start == NULL) {
> > > > > > > > +			fprintf(stderr, "Illegal lcore\n");
> > > > > > > > +			ret = -1;
> > > > > > > > +			break;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		start += 5;
> > > > > > > > +		lcore_id = strtol(start, &end, 0);
> > > > > > > > +		if (end == start) {
> > > > > > > > +			fprintf(stderr, "No input lcore ID or ID %d is
> > > > > > > > wrong\n", lcore_id);
> > > > > > > > +			ret = -1;
> > > > > > > > +			break;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		lcore_dma_map = &test_case->lcore_dma_map;
> > > > > > > > +		lcore_dma_map->lcores[lcore_dma_map->cnt] =
> > > > lcore_id;
> > > > > > > > +		strcpy(lcore_dma_map-
> > > > >dma_names[lcore_dma_map-
> > > > > > > > >cnt], ptrs[1]);
> > > > > > > > +		lcore_dma_map->cnt++;
> > > > > > > > +		substr = strtok(NULL, ",");
> > > > > > > > +	} while (substr != NULL);
> > > > > > > > +
> > > > > > > > +out:
> > > > > > > > +	free(input);
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int
> > > > > > > > +parse_entry(const char *value, struct test_configure_entry
> > *entry)
> > > > {
> > > > > > > > +	char input[255] = {0};
> > > > > > > > +	char *args[MAX_PARAMS_PER_ENTRY];
> > > > > > > > +	int args_nr = -1;
> > > > > > > > +
> > > > > > > > +	if (value == NULL || entry == NULL)
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	strncpy(input, value, 254);
> > > > > > > > +	if (*input == '\0')
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	args_nr = rte_strsplit(input, strlen(input), args,
> > > > > > > > MAX_PARAMS_PER_ENTRY, ',');
> > > > > > > > +	if (args_nr != 1 && args_nr != 4)
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > > > > > > > +
> > > > > > > > +	if (args_nr == 4) {
> > > > > > > > +		entry->last = (uint32_t)atoi(args[1]);
> > > > > > > > +		entry->incr = (uint32_t)atoi(args[2]);
> > > > > > > > +		if (!strcmp(args[3], "MUL"))
> > > > > > > > +			entry->op = OP_MUL;
> > > > > > > > +		else if (!strcmp(args[3], "ADD"))
> > > > > > > > +			entry->op = OP_ADD;
> > > > > > > > +		else {
> > > > > > > > +			printf("Invalid op %s.\n", args[3]);
> > > > > > > > +			args_nr = -1;
> > > > > > > > +		}
> > > > > > > > +	} else {
> > > > > > > > +		entry->op = OP_NONE;
> > > > > > > > +		entry->last = 0;
> > > > > > > > +		entry->incr = 0;
> > > > > > > > +	}
> > > > > > > > +out:
> > > > > > > > +	return args_nr;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static uint16_t
> > > > > > > > +load_configs(const char *path)
> > > > > > > > +{
> > > > > > > > +	struct rte_cfgfile *cfgfile;
> > > > > > > > +	int nb_sections, i;
> > > > > > > > +	struct test_configure *test_case;
> > > > > > > > +	char section_name[CFG_NAME_LEN];
> > > > > > > > +	const char *case_type;
> > > > > > > > +	const char *lcore_dma;
> > > > > > > > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> > > > > > > > *kick_batch_str;
> > > > > > > > +	int args_nr, nb_vp;
> > > > > > > > +	bool is_dma;
> > > > > > > > +
> > > > > > > > +	printf("config file parsing...\n");
> > > > > > > > +	cfgfile = rte_cfgfile_load(path, 0);
> > > > > > > > +	if (!cfgfile) {
> > > > > > > > +		printf("Open configure file error.\n");
> > > > > > > > +		exit(1);
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > > > > > > > +	if (nb_sections > MAX_TEST_CASES) {
> > > > > > > > +		printf("Error: The maximum number of cases is
> > > > %d.\n",
> > > > > > > > MAX_TEST_CASES);
> > > > > > > > +		exit(1);
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < nb_sections; i++) {
> > > > > > > > +		snprintf(section_name, CFG_NAME_LEN, "case%d", i
> > > > + 1);
> > > > > > > > +		test_case = &test_cases[i];
> > > > > > > > +		case_type = rte_cfgfile_get_entry(cfgfile,
> > > > section_name,
> > > > > > > > "type");
> > > > > > > > +		if (!case_type) {
> > > > > > > > +			printf("Error: No case type in case %d, the
> > > > test will be
> > > > > > > > finished here.\n",
> > > > > > > > +				i + 1);
> > > > > > > > +			test_case->is_valid = false;
> > > > > > > > +			continue;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
> > > > > > > > +			test_case->test_type =
> > > > > > > > TEST_TYPE_DMA_MEM_COPY;
> > > > > > > > +			test_case->test_type_str =
> > > > DMA_MEM_COPY;
> > > > > > > > +			is_dma = true;
> > > > > > > > +		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
> > > > > > > > +			test_case->test_type =
> > > > > > > > TEST_TYPE_CPU_MEM_COPY;
> > > > > > > > +			test_case->test_type_str =
> > > > CPU_MEM_COPY;
> > > > > > > > +			is_dma = false;
> > > > > > > > +		} else {
> > > > > > > > +			printf("Error: Cannot find case type %s in
> > > > case%d.\n",
> > > > > > > > case_type, i + 1);
> > > > > > > > +			test_case->is_valid = false;
> > > > > > > > +			continue;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		nb_vp = 0;
> > > > > > > > +
> > > > > > > > +		test_case->src_numa_node =
> > > > > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > > > +
> > > > > > > > 	section_name, "src_numa_node"));
> > > > > > > > +		test_case->dst_numa_node =
> > > > > > > > (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > > > +
> > > > > > > > 	section_name, "dst_numa_node"));
> > > > > > > > +
> > > > > > > > +		mem_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > section_name,
> > > > > > > > "mem_size");
> > > > > > > > +		args_nr = parse_entry(mem_size_str, &test_case-
> > > > > > > > >mem_size);
> > > > > > > > +		if (args_nr < 0) {
> > > > > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > > > > +			test_case->is_valid = false;
> > > > > > > > +			continue;
> > > > > > > > +		} else if (args_nr > 1)
> > > > > > > > +			nb_vp++;
> > > > > > > > +
> > > > > > > > +		buf_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > section_name,
> > > > > > > > "buf_size");
> > > > > > > > +		args_nr = parse_entry(buf_size_str, &test_case-
> > > > >buf_size);
> > > > > > > > +		if (args_nr < 0) {
> > > > > > > > +			printf("parse error in case %d.\n", i + 1);
> > > > > > > > +			test_case->is_valid = false;
> > > > > > > > +			continue;
> > > > > > > > +		} else if (args_nr > 1)
> > > > > > > > +			nb_vp++;
> > > > > > > > +
> > > > > > > > +		if (is_dma) {
> > > > > > > > +			ring_size_str = rte_cfgfile_get_entry(cfgfile,
> > > > > > > > section_name,
> > > > > > > > +
> > > > > > > > 	"dma_ring_size");
> > > > > > > > +			args_nr = parse_entry(ring_size_str,
> > > > &test_case-
> > > > > > > > >ring_size);
> > > > > > > > +			if (args_nr < 0) {
> > > > > > > > +				printf("parse error in case %d.\n", i +
> > > > 1);
> > > > > > > > +				test_case->is_valid = false;
> > > > > > > > +				continue;
> > > > > > > > +			} else if (args_nr > 1)
> > > > > > > > +				nb_vp++;
> > > > > > > > +
> > > > > > > > +			kick_batch_str =
> > > > rte_cfgfile_get_entry(cfgfile,
> > > > > > > > section_name, "kick_batch");
> > > > > > > > +			args_nr = parse_entry(kick_batch_str,
> > > > &test_case-
> > > > > > > > >kick_batch);
> > > > > > > > +			if (args_nr < 0) {
> > > > > > > > +				printf("parse error in case %d.\n", i +
> > > > 1);
> > > > > > > > +				test_case->is_valid = false;
> > > > > > > > +				continue;
> > > > > > > > +			} else if (args_nr > 1)
> > > > > > > > +				nb_vp++;
> > > > > > > > +
> > > > > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > > > > section_name, "lcore_dma");
> > > > > > > > +			int lcore_ret = parse_lcore_dma(test_case,
> > > > > > > > lcore_dma);
> > > > > > > > +			if (lcore_ret < 0) {
> > > > > > > > +				printf("parse lcore dma error in case
> > > > %d.\n", i
> > > > > > > 1);
> > > > > > > > +				test_case->is_valid = false;
> > > > > > > > +				continue;
> > > > > > > > +			}
> > > > > > > > +		} else {
> > > > > > > > +			lcore_dma = rte_cfgfile_get_entry(cfgfile,
> > > > > > > > section_name, "lcore");
> > > > > > > > +			int lcore_ret = parse_lcore(test_case,
> > > > lcore_dma);
> > > > > > > > +			if (lcore_ret < 0) {
> > > > > > > > +				printf("parse lcore error in case
> > > > %d.\n", i + 1);
> > > > > > > > +				test_case->is_valid = false;
> > > > > > > > +				continue;
> > > > > > > > +			}
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		if (nb_vp > 1) {
> > > > > > > > +			printf("Error, each section can only have a
> > > > single
> > > > > > > > variable parameter.\n");
> > > > > > > > +			test_case->is_valid = false;
> > > > > > > > +			continue;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		test_case->cache_flush =
> > > > > > > > +			(int)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > section_name,
> > > > > > > > "cache_flush"));
> > > > > > > > +		test_case->test_secs =
> > > > > > > > (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > > > > > > > +					section_name,
> > > > "test_seconds"));
> > > > > > > > +
> > > > > > > > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> > > > > > > > section_name, "eal_args");
> > > > > > > > +		test_case->is_valid = true;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	rte_cfgfile_close(cfgfile);
> > > > > > > > +	printf("config file parsing complete.\n\n");
> > > > > > > > +	return i;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +/* Parse the argument given in the command line of the
> > > > > > > > +application */ static int append_eal_args(int argc, char **argv,
> > > > > > > > +const char *eal_args, char **new_argv) {
> > > > > > > > +	int i;
> > > > > > > > +	char *tokens[MAX_EAL_PARAM_NB];
> > > > > > > > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > > > > > > > +	int token_nb, new_argc = 0;
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < argc; i++) {
> > > > > > > > +		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
> > > > > > > > +				(strcmp(argv[i],
> > > > CMDLINE_RESULT_ARG) ==
> > > > > > > > 0)) {
> > > > > > > > +			i++;
> > > > > > > > +			continue;
> > > > > > > > +		}
> > > > > > > > +		strlcpy(new_argv[new_argc], argv[i],
> > > > > > > > sizeof(new_argv[new_argc]));
> > > > > > > > +		new_argc++;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	if (eal_args) {
> > > > > > > > +		strlcpy(args, eal_args, sizeof(args));
> > > > > > > > +		token_nb = rte_strsplit(args, strlen(args),
> > > > > > > > +					tokens,
> > > > MAX_EAL_PARAM_NB, ' ');
> > > > > > > > +		for (i = 0; i < token_nb; i++)
> > > > > > > > +			strcpy(new_argv[new_argc++], tokens[i]);
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	return new_argc;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +int
> > > > > > > > +main(int argc, char *argv[])
> > > > > > > > +{
> > > > > > > > +	int ret;
> > > > > > > > +	uint16_t case_nb;
> > > > > > > > +	uint32_t i, nb_lcores;
> > > > > > > > +	pid_t cpid, wpid;
> > > > > > > > +	int wstatus;
> > > > > > > > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > > > > > > > +	char *pargs[MAX_EAL_PARAM_NB];
> > > > > > > > +	char *cfg_path_ptr = NULL;
> > > > > > > > +	char *rst_path_ptr = NULL;
> > > > > > > > +	char rst_path[PATH_MAX];
> > > > > > > > +	int new_argc;
> > > > > > > > +	bool is_first_case = true;
> > > > > > > > +
> > > > > > > > +	memset(args, 0, sizeof(args));
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < RTE_DIM(pargs); i++)
> > > > > > > > +		pargs[i] = args[i];
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < (uint32_t)argc; i++) {
> > > > > > > > +		if (strncmp(argv[i], CMDLINE_CONFIG_ARG,
> > > > > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > > > > +			cfg_path_ptr = argv[i + 1];
> > > > > > > > +		if (strncmp(argv[i], CMDLINE_RESULT_ARG,
> > > > > > > > MAX_LONG_OPT_SZ) == 0)
> > > > > > > > +			rst_path_ptr = argv[i + 1];
> > > > > > > > +	}
> > > > > > > > +	if (cfg_path_ptr == NULL) {
> > > > > > > > +		printf("Config file not assigned.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +	if (rst_path_ptr == NULL) {
> > > > > > > > +		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
> > > > > > > > +		strcat(strtok(basename(rst_path), "."),
> > > > "_result.csv");
> > > > > > > > +		rst_path_ptr = rst_path;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	case_nb = load_configs(cfg_path_ptr);
> > > > > > > > +	fd = fopen(rst_path_ptr, "w");
> > > > > > > > +	if (fd == NULL) {
> > > > > > > > +		printf("Open output CSV file error.\n");
> > > > > > > > +		return -1;
> > > > > > > > +	}
> > > > > > > > +	fclose(fd);
> > > > > > > > +
> > > > > > > > +	for (i = 0; i < case_nb; i++) {
> > > > > > > > +		if (test_cases[i].test_type == TEST_TYPE_NONE) {
> > > > > > > > +			printf("No test type in test case %d.\n\n", i +
> > > > 1);
> > > > > > > > +			continue;
> > > > > > > > +		}
> > > > > > > > +		if (!test_cases[i].is_valid) {
> > > > > > > > +			printf("Invalid test case %d.\n\n", i + 1);
> > > > > > > > +			continue;
> > > > > > > > +		}
> > > > > > > > +
> > > > > > > > +		cpid = fork();
> > > > > > >
> > > > > > > [Anoob] Do we really need fork()? Can't we use code like,
> > > > > > >
> > > > > > > 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
> > > > > > > 			ret |= rte_eal_wait_lcore(lcore_id);
> > > > > > > 		}
> > > > > > >
> > > > > > > to wait for all threads to exit?
> > > > > >
> > > > > > [Cheng] Good question. fork() is used here to create a new process
> > > > > > for each test case. In order for each test case to have a new EAL
> > > > > > environment (for flexibility), the EAL must be reinitialized for each
> > > > > > case. However, the EAL parameters can only be initialized once per
> > > > > > process. Therefore, we use a new process to run each new test case.
> > > > > > Moreover, each test case runs sequentially and does not affect the
> > > > > > others, ensuring the accuracy of the performance data. Your code
> > > > > > would wait for all threads to exit in the same process. However, it
> > > > > > would not provide a "clean" environment for each test case like
> > > > > > fork() does. fork() allows us to have a fully reinitialized
> > > > > > environment, with no impact or side effects from previous test cases.
> > > > > > This results in clean, precise performance data for each case.
> > > > > >
> > > > > > Please let me know your thoughts on this. And please let me know
> if
> > > > > > you have any other questions or require any clarification.
> > > > >
> > > > > [Anoob] This was just a generic observation. I do not have a strong
> > > opinion
> > > > > either way.
> > > > >
> > > >
> > > > [Cheng] sure, got it.
> > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Cheng
> > > > > >
> > > > > > >
> > > > > > > > +		if (cpid < 0) {
> > > > > > > > +			printf("Fork case %d failed.\n", i + 1);
> > > > > > > > +			exit(EXIT_FAILURE);
> > > > > > > > +		} else if (cpid == 0) {
> > > > > > > > +			printf("\nRunning case %u\n\n", i + 1);
> > > > > > > > +
> > > > > > > > +			new_argc = append_eal_args(argc, argv,
> > > > > > > > test_cases[i].eal_args, pargs);
> > > > > > > > +			ret = rte_eal_init(new_argc, pargs);
> > > > > > > > +			if (ret < 0)
> > > > > > > > +				rte_exit(EXIT_FAILURE, "Invalid EAL
> > > > > > > > arguments\n");
> > > > > > > > +
> > > > > > > > +			/* Check lcores. */
> > > > > > > > +			nb_lcores = rte_lcore_count();
> > > > > > > > +			if (nb_lcores < 2)
> > > > > > > > +				rte_exit(EXIT_FAILURE,
> > > > > > > > +					"There should be at least 2
> > > > worker
> > > > > > > > lcores.\n");
> > > > > > > > +
> > > > > > > > +			fd = fopen(rst_path_ptr, "a");
> > > > > > > > +			if (!fd) {
> > > > > > > > +				printf("Open output CSV file
> > > > error.\n");
> > > > > > > > +				return 0;
> > > > > > > > +			}
> > > > > > > > +
> > > > > > > > +			if (is_first_case) {
> > > > > > > > +				output_env_info();
> > > > > > > > +				is_first_case = false;
> > > > > > > > +			}
> > > > > > > > +			run_test(i + 1, &test_cases[i]);
> > > > > > > > +
> > > > > > > > +			/* clean up the EAL */
> > > > > > > > +			rte_eal_cleanup();
> > > > > > > > +
> > > > > > > > +			fclose(fd);
> > > > > > > > +
> > > > > > > > +			printf("\nCase %u completed.\n\n", i + 1);
> > > > > > > > +
> > > > > > > > +			exit(EXIT_SUCCESS);
> > > > > > > > +		} else {
> > > > > > > > +			wpid = waitpid(cpid, &wstatus, 0);
> > > > > > > > +			if (wpid == -1) {
> > > > > > > > +				printf("waitpid error.\n");
> > > > > > > > +				exit(EXIT_FAILURE);
> > > > > > > > +			}
> > > > > > > > +
> > > > > > > > +			if (WIFEXITED(wstatus))
> > > > > > > > +				printf("Case process exited. status
> > > > %d\n\n",
> > > > > > > > +					WEXITSTATUS(wstatus));
> > > > > > > > +			else if (WIFSIGNALED(wstatus))
> > > > > > > > +				printf("Case process killed by signal
> > > > %d\n\n",
> > > > > > > > +					WTERMSIG(wstatus));
> > > > > > > > +			else if (WIFSTOPPED(wstatus))
> > > > > > > > +				printf("Case process stopped by
> > > > signal
> > > > > > > > %d\n\n",
> > > > > > > > +					WSTOPSIG(wstatus));
> > > > > > > > +			else if (WIFCONTINUED(wstatus))
> > > > > > > > +				printf("Case process
> > > > continued.\n\n");
> > > > > > > > +			else
> > > > > > > > +				printf("Case process unknown
> > > > > > > > terminated.\n\n");
> > > > > > > > +		}
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	printf("Bye...\n");
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > diff --git a/app/test-dma-perf/main.h b/app/test-dma-
> > perf/main.h
> > > > > > > > new file mode 100644 index 0000000000..215ac42673
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/app/test-dma-perf/main.h
> > > > > > > > @@ -0,0 +1,69 @@
> > > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > > + * Copyright(c) 2023 Intel Corporation  */
> > > > > > > > +
> > > > > > > > +#ifndef _MAIN_H_
> > > > > > > > +#define _MAIN_H_
> > > > > > > > +
> > > > > > > > +
> > > > > > > > +#include <rte_common.h>
> > > > > > > > +#include <rte_cycles.h>
> > > > > > > > +#include <rte_dev.h>
> > > > > > > > +#include <rte_dmadev.h>
> > > > > > > > +
> > > > > > > > +#ifndef __maybe_unused
> > > > > > > > +#define __maybe_unused	__rte_unused
> > > > > > > > +#endif
> > > > > > > > +
> > > > > > > > +#define MAX_WORKER_NB 128
> > > > > > > > +#define MAX_OUTPUT_STR_LEN 512
> > > > > > > > +
> > > > > > > > +#define MAX_DMA_NB 128
> > > > > > > > +#define MAX_LCORE_NB 256
> > > > > > > > +
> > > > > > > > +extern char
> > > > output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > > > > > > > +
> > > > > > > > +typedef enum {
> > > > > > > > +	OP_NONE = 0,
> > > > > > > > +	OP_ADD,
> > > > > > > > +	OP_MUL
> > > > > > > > +} alg_op_type;
> > > > > > > > +
> > > > > > > > +struct test_configure_entry {
> > > > > > > > +	uint32_t first;
> > > > > > > > +	uint32_t last;
> > > > > > > > +	uint32_t incr;
> > > > > > > > +	alg_op_type op;
> > > > > > > > +	uint32_t cur;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +struct lcore_dma_map_t {
> > > > > > > > +	uint32_t lcores[MAX_WORKER_NB];
> > > > > > > > +	char
> > > > dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
> > > > > > > > +	int16_t dma_ids[MAX_WORKER_NB];
> > > > > > > > +	uint16_t cnt;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +struct test_configure {
> > > > > > > > +	bool is_valid;
> > > > > > > > +	uint8_t test_type;
> > > > > > > > +	const char *test_type_str;
> > > > > > > > +	uint16_t src_numa_node;
> > > > > > > > +	uint16_t dst_numa_node;
> > > > > > > > +	uint16_t opcode;
> > > > > > > > +	bool is_dma;
> > > > > > > > +	struct lcore_dma_map_t lcore_dma_map;
> > > > > > > > +	struct test_configure_entry mem_size;
> > > > > > > > +	struct test_configure_entry buf_size;
> > > > > > > > +	struct test_configure_entry ring_size;
> > > > > > > > +	struct test_configure_entry kick_batch;
> > > > > > > > +	uint32_t cache_flush;
> > > > > > > > +	uint32_t nr_buf;
> > > > > > > > +	uint16_t test_secs;
> > > > > > > > +	const char *eal_args;
> > > > > > > > +	uint8_t scenario_id;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +void mem_copy_benchmark(struct test_configure *cfg, bool
> > > > is_dma);
> > > > > > > > +
> > > > > > > > +#endif /* _MAIN_H_ */
> > > > > > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > > > > > perf/meson.build new file mode 100644 index
> > > > 0000000000..bd6c264002
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/app/test-dma-perf/meson.build
> > > > > > > > @@ -0,0 +1,17 @@
> > > > > > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-
> 2023
> > > > > > > > +Intel Corporation
> > > > > > > > +
> > > > > > > > +# meson file, for building this app as part of a main DPDK build.
> > > > > > > > +
> > > > > > > > +if is_windows
> > > > > > > > +    build = false
> > > > > > > > +    reason = 'not supported on Windows'
> > > > > > > > +    subdir_done()
> > > > > > > > +endif
> > > > > > > > +
> > > > > > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > > > > > +
> > > > > > > > +sources = files(
> > > > > > > > +        'main.c',
> > > > > > > > +        'benchmark.c',
> > > > > > > > +)
> > > > > > > > --
> > > > > > > > 2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16  8:43                 ` Jiang, Cheng1
@ 2023-06-16  9:48                   ` Anoob Joseph
  2023-06-16 10:52                     ` Anoob Joseph
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-16  9:48 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Cheng,

> [Cheng] sure, no problem. Feel free to contact me if you have any more
> questions.

No further comments from my side. Can you submit the next version?

Top level, I think you may need to add a doc file for the new app. 

A few issues that we are also checking on our end:
1. The app has a significant memory footprint. We need to see where it can be improved.
2. EAL args passing doesn't seem to be working. Are you able to enable specific devices by making use of the '-a' arg?

Thanks,
Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16  9:48                   ` Anoob Joseph
@ 2023-06-16 10:52                     ` Anoob Joseph
  2023-06-16 15:15                       ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-16 10:52 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Cheng,

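I think there is a bug in EAL parsing. Since new_argv is a 'char **', sizeof(new_argv[new_argc]) is the size of a pointer rather than of the destination buffer, so strlcpy() truncates each copied argument. Please check the diff below and see if we should do something similar.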

diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index d65655b87b..7fcaa5acf6 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -432,7 +432,7 @@ append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
                        i++;
                        continue;
                }
-               strlcpy(new_argv[new_argc], argv[i], sizeof(new_argv[new_argc]));
+               strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
                new_argc++;
        }

Thanks,
Anoob

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Friday, June 16, 2023 3:19 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> > [Cheng] sure, no problem. Feel free to contact me if you have any more
> > questions.
> 
> No further comments from my side. Can you submit next version?
> 
> Top level, I think you may need to add a doc file for the new app.
> 
> Few issues that we are also checking in our end, 1. The app is having
> significant memory footprint. Need to see where we can improve.
> 2. EAL args passing doesn't seem to be working. Are you able to enable
> specific devices by making using of '-a' arg?
> 
> Thanks,
> Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16 10:52                     ` Anoob Joseph
@ 2023-06-16 15:15                       ` Jiang, Cheng1
  2023-06-17  4:35                         ` Jiang, Cheng1
  2023-06-18  5:34                         ` Jiang, Cheng1
  0 siblings, 2 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-16 15:15 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Friday, June 16, 2023 6:53 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> I think there is a bug in EAL parsing. Please check the below diff and see if we
> should do something similar.
> 
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c index
> d65655b87b..7fcaa5acf6 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -432,7 +432,7 @@ append_eal_args(int argc, char **argv, const char
> *eal_args, char **new_argv)
>                         i++;
>                         continue;
>                 }
> -               strlcpy(new_argv[new_argc], argv[i], sizeof(new_argv[new_argc]));
> +               strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
>                 new_argc++;
>         }
> 
> Thanks,
> Anoob

[Cheng] Yes, there is an issue there, and I have the same fix. I'll submit it later, thanks.

> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Friday, June 16, 2023 3:19 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang,
> > YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Cheng,
> >
> > > [Cheng] sure, no problem. Feel free to contact me if you have any
> > > more questions.
> >
> > No further comments from my side. Can you submit next version?

[Cheng] Glad to know. Yes, the next version is almost ready, I'll submit it this weekend.

> >
> > Top level, I think you may need to add a doc file for the new app.

[Cheng] sure, I was thinking about this, and I'll add a doc for this, thanks.

> >
> > Few issues that we are also checking in our end, 1. The app is having
> > significant memory footprint. Need to see where we can improve.
> > 2. EAL args passing doesn't seem to be working. Are you able to enable
> > specific devices by making using of '-a' arg?

[Cheng] I'll take a look at the significant memory footprint issue later, thanks. As for the EAL args processing, yes, there is an issue; Dengdui's comments also mentioned it, and I have fixed it in the v7 patch.

> >
> > Thanks,
> > Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16 15:15                       ` Jiang, Cheng1
@ 2023-06-17  4:35                         ` Jiang, Cheng1
  2023-06-19  5:48                           ` Anoob Joseph
  2023-06-18  5:34                         ` Jiang, Cheng1
  1 sibling, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-17  4:35 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

I have a question about the doc.
Do you think I should add a new folder under doc/guides/, such as doc/guides/dma_perf?

Thanks,
Cheng



> -----Original Message-----
> From: Jiang, Cheng1
> Sent: Friday, June 16, 2023 11:16 PM
> To: Anoob Joseph <anoobj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> <Xuan.Ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <Chenbo.Xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <WenwuX.Ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Anoob,
> 
> Replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Friday, June 16, 2023 6:53 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang,
> > YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Cheng,
> >
> > I think there is a bug in EAL parsing. Please check the below diff and
> > see if we should do something similar.
> >
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c index
> > d65655b87b..7fcaa5acf6 100644
> > --- a/app/test-dma-perf/main.c
> > +++ b/app/test-dma-perf/main.c
> > @@ -432,7 +432,7 @@ append_eal_args(int argc, char **argv, const char
> > *eal_args, char **new_argv)
> >                         i++;
> >                         continue;
> >                 }
> > -               strlcpy(new_argv[new_argc], argv[i],
> sizeof(new_argv[new_argc]));
> > +               strlcpy(new_argv[new_argc], argv[i],
> > + MAX_EAL_PARAM_LEN);
> >                 new_argc++;
> >         }
> >
> > Thanks,
> > Anoob
> 
> [Cheng] yes there is an issue in it. And I have the same fix. I'll submit it later,
> thanks.
> 
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Friday, June 16, 2023 3:19 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang,
> > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi Cheng,
> > >
> > > > [Cheng] sure, no problem. Feel free to contact me if you have any
> > > > more questions.
> > >
> > > No further comments from my side. Can you submit next version?
> 
> [Cheng] Glad to know. Yes, the next version is almost ready, I'll submit it this
> weekend.
> 
> > >
> > > Top level, I think you may need to add a doc file for the new app.
> 
> [Cheng] sure, I was thinking about this, and I'll add a doc for this, thanks.
> 
> > >
> > > Few issues that we are also checking in our end, 1. The app is
> > > having significant memory footprint. Need to see where we can improve.
> > > 2. EAL args passing doesn't seem to be working. Are you able to
> > > enable specific devices by making using of '-a' arg?
> 
> [Cheng] I'll take a look at the significant memory footprint issue later, thanks.
> As for the eal args process, yes there is an issue, Dengdui's comments also
> mentioned it, and I have fixed it in the v7 patch.
> 
> > >
> > > Thanks,
> > > Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-16 15:15                       ` Jiang, Cheng1
  2023-06-17  4:35                         ` Jiang, Cheng1
@ 2023-06-18  5:34                         ` Jiang, Cheng1
  2023-06-19  5:25                           ` Anoob Joseph
  1 sibling, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-18  5:34 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

I've looked into the memory footprint issue, but I didn't find any problem.
Could you please share the config file you are using? It may help me speed up the debugging process.

Thanks a lot,
Cheng

> -----Original Message-----
> From: Jiang, Cheng1
> Sent: Friday, June 16, 2023 11:16 PM
> To: Anoob Joseph <anoobj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> <Xuan.Ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <Chenbo.Xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <WenwuX.Ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Anoob,
> 
> Replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Friday, June 16, 2023 6:53 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang,
> > YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Cheng,
> >
> > I think there is a bug in EAL parsing. Please check the below diff and
> > see if we should do something similar.
> >
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c index
> > d65655b87b..7fcaa5acf6 100644
> > --- a/app/test-dma-perf/main.c
> > +++ b/app/test-dma-perf/main.c
> > @@ -432,7 +432,7 @@ append_eal_args(int argc, char **argv, const char
> > *eal_args, char **new_argv)
> >                         i++;
> >                         continue;
> >                 }
> > -               strlcpy(new_argv[new_argc], argv[i], sizeof(new_argv[new_argc]));
> > +               strlcpy(new_argv[new_argc], argv[i],
> > + MAX_EAL_PARAM_LEN);
> >                 new_argc++;
> >         }
> >
> > Thanks,
> > Anoob
> 
> [Cheng] yes there is an issue in it. And I have the same fix. I'll submit it later,
> thanks.
> 
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Friday, June 16, 2023 3:19 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang,
> > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi Cheng,
> > >
> > > > [Cheng] sure, no problem. Feel free to contact me if you have any
> > > > more questions.
> > >
> > > No further comments from my side. Can you submit next version?
> 
> [Cheng] Glad to know. Yes, the next version is almost ready, I'll submit it this
> weekend.
> 
> > >
> > > Top level, I think you may need to add a doc file for the new app.
> 
> [Cheng] sure, I was thinking about this, and I'll add a doc for this, thanks.
> 
> > >
> > > Few issues that we are also checking in our end, 1. The app is
> > > having significant memory footprint. Need to see where we can improve.
> > > 2. EAL args passing doesn't seem to be working. Are you able to
> > > enable specific devices by making using of '-a' arg?
> 
> [Cheng] I'll take a look at the significant memory footprint issue later, thanks.
> As for the eal args process, yes there is an issue, Dengdui's comments also
> mentioned it, and I have fixed it in the v7 patch.
> 
> > >
> > > Thanks,
> > > Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v7] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (4 preceding siblings ...)
  2023-06-13  4:31 ` [PATCH v6] " Cheng Jiang
@ 2023-06-18 12:26 ` Cheng Jiang
  2023-06-20  6:53 ` [PATCH v8] " Cheng Jiang
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-06-18 12:26 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v7:
  fixed some strcpy issues;
  removed cache setup in calling rte_pktmbuf_pool_create();
  fixed some typos;
  added some memory free and null set operations;
  improved result calculation;
v6:
  improved code based on Anoob's comments;
  fixed some code structure issues;
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 498 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  61 ++++
 app/test-dma-perf/main.c      | 594 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 ++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1240 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..b866d5e5c0
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,498 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+#define POLL_MAX 1000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+/* Control flags and completion counters one worker shares with the main lcore. */
+struct worker_info {
+	bool ready_flag;	/* set to true by the worker before it waits for start_flag */
+	bool start_flag;	/* set by mem_copy_benchmark() to release the worker */
+	bool stop_flag;		/* set by mem_copy_benchmark() to end the worker's copy loop */
+	uint32_t total_cpl;	/* running count of completed copies, updated by the worker */
+	uint32_t test_cpl;	/* completions attributed to the measured window */
+};
+
+/* Parameters handed to one worker lcore for a single benchmark scenario. */
+struct lcore_params {
+	uint8_t scenario_id;
+	unsigned int lcore_id;
+	char *dma_name;		/* only assigned when the scenario uses DMA */
+	uint16_t worker_id;	/* index of this worker in lcores_p[] */
+	uint16_t dev_id;	/* dmadev id; only assigned when the scenario uses DMA */
+	uint32_t nr_buf;	/* number of buffers in this worker's slice */
+	uint16_t kick_batch;	/* submit-and-poll interval in enqueued copies (DMA only) */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint16_t test_secs;
+	struct rte_mbuf **srcs;	/* first source mbuf of this worker's slice */
+	struct rte_mbuf **dsts;	/* first destination mbuf of this worker's slice */
+	struct worker_info worker_info;
+};
+
+/*
+ * Two views of the same lcore_params pointer: v_ptr for volatile-qualified
+ * accesses to the worker state, ptr for calls such as rte_free() that take a
+ * plain pointer.
+ */
+union lcore_params_union {
+	volatile struct lcore_params *v_ptr;
+	struct lcore_params *ptr;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static union lcore_params_union lcores_p[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write "In <func>:<line> - " followed by the formatted message to stderr.
+ * Returns the sum of the two print calls' return values.
+ * Normally reached through the PRINT_ERR() macro.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int written = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	written += vfprintf(stderr, format, args);
+	va_end(args);
+
+	return written;
+}
+
+/*
+ * Derive the per-worker figures reported for one scenario.
+ *
+ * memory:    MB covered by one worker (its source plus destination buffers).
+ * ave_cycle: timer cycles per completed copy.
+ * bandwidth: Gbps, from copies per second times buf_size.
+ * mops:      millions of copies per second.
+ *
+ * When total_cnt or test_secs is zero no rate can be computed, so the rate
+ * figures are reported as zero instead of dividing by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	float ops;
+
+	/* Widen before multiplying: buf_size * nr_buf * 2 can exceed 32 bits. */
+	*memory = (float)((uint64_t)buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+
+	if (total_cnt == 0 || test_secs == 0) {
+		*ave_cycle = 0;
+		*bandwidth = 0;
+		*mops = 0;
+		return;
+	}
+
+	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
+	ops = (float)total_cnt / test_secs;
+	*mops = ops / (1000 * 1000);
+	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
+}
+
+/*
+ * Print one worker's result to stdout and stage the matching CSV line in
+ * output_str[lcore_id] for a later output_csv() call.
+ *
+ * dma_name is only read when is_dma is true.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, float memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %.2lfMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	/* output_str[] has MAX_WORKER_NB rows; a larger lcore id has no CSV slot. */
+	if (lcore_id >= MAX_WORKER_NB) {
+		fprintf(stderr, "lcore %u has no CSV slot (max %d), result not saved.\n",
+			lcore_id, MAX_WORKER_NB);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Issue clflush over the first buf_size bytes of the data area of each of
+ * the nr_buf mbufs in array, in steps of 64 bytes.
+ * Only compiled in when RTE_ARCH_X86_64 is defined; otherwise does nothing.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a dmadev with a single mem-to-mem virtual channel of ring_size
+ * descriptors and start it. Any failure terminates the process through
+ * rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	/* info is only meaningful when the query itself succeeded */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve every DMA device name in cfg->lcore_dma_map to a device id, store
+ * it in dma_ids[] and configure/start the device.
+ *
+ * Returns 0 when every worker got a device, -1 otherwise. Devices configured
+ * before the first lookup failure stay configured.
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		/* the lookup reports failure as a negative errno value, not just -1 */
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+/*
+ * Submit the copies enqueued on vchan 0 of dev_id, then reap up to
+ * MAX_DMA_CPL_NB completions in one call. The reaped count is subtracted from
+ * *async_cnt and added to worker_info->total_cpl.
+ * A failing submit stops and closes the device and exits the process.
+ */
+static inline void
+do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
+			volatile struct worker_info *worker_info)
+{
+	uint16_t done;
+
+	if (rte_dma_submit(dev_id, 0) < 0) {
+		rte_dma_stop(dev_id);
+		rte_dma_close(dev_id);
+		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
+	}
+
+	done = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+	*async_cnt -= done;
+	worker_info->total_cpl += done;
+}
+
+/*
+ * Worker entry point for the DMA copy test.
+ *
+ * p points to a uint16_t holding this worker's index into lcores_p[].
+ * The worker announces itself through ready_flag, spins until start_flag is
+ * set, then keeps enqueuing copies of its srcs[] to dsts[] slice until
+ * stop_flag is seen. Always returns 0; enqueue or submit errors end the
+ * process through rte_exit().
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	const uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait until the main lcore releases all workers */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]),
+				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					/* submit what is queued, reap completions, retry the same buffer */
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else {
+					/* Error exit */
+					rte_dma_stop(dev_id);
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/* submit and poll each time the in-flight count is a multiple of kick_batch */
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		/* stop_flag is only checked after a full pass over the buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/*
+	 * Drain what is still in flight, giving up after POLL_MAX polls.
+	 * Completions reaped here are not added to total_cpl.
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker entry point for the CPU copy test.
+ *
+ * p points to a uint16_t holding this worker's index into lcores_p[].
+ * The worker announces itself through ready_flag, spins until start_flag is
+ * set, then copies its srcs[] slice to dsts[] with rte_memcpy() until
+ * stop_flag is seen, counting every copy in total_cpl. Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	const uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint32_t nr_buf = para->nr_buf;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait until the main lcore releases all workers */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * Copy buffer from src to dst through the mbufs' virtual
+			 * addresses: an IOVA is a device-side address and is not
+			 * guaranteed to be dereferenceable by the CPU.
+			 */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		/* stop_flag is only checked after a full pass over the buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA nodes
+ * and fill *srcs / *dsts with cfg->nr_buf mbufs each.
+ *
+ * Returns 0 on success and -1 on failure. Nothing is released here on
+ * failure: the pools are file-scope and the arrays are returned through
+ * srcs/dsts, so the caller is expected to clean up whatever was created.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/*
+	 * Zeroed arrays: if a bulk get below fails, the caller's cleanup then
+	 * sees NULL entries rather than uninitialised pointers.
+	 */
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one benchmark scenario with the current values in cfg.
+ *
+ * Sets cfg->nr_buf, allocates the buffers, optionally configures the DMA
+ * devices, launches one worker per entry of cfg->lcore_dma_map, measures the
+ * completions over cfg->test_secs seconds (after a short warm-up) and reports
+ * one result per worker through output_result(). All resources created here
+ * are released before returning, on success and on failure.
+ *
+ * is_dma selects do_dma_mem_copy() (true) or do_cpu_mem_copy() (false).
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint16_t nb_launched = 0;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	/* These values are used as divisors or array bounds below. */
+	if (buf_size == 0 || nb_workers == 0 || nb_workers > MAX_WORKER_NB ||
+			(is_dma && kick_batch == 0)) {
+		printf("Error: invalid buffer size (%u), worker count (%u) or kick batch (%u).\n",
+			buf_size, nb_workers, kick_batch);
+		return;
+	}
+
+	/* 64-bit arithmetic: mem_size (MB) * 1024 * 1024 does not fit 32 bits from 4096 MB up. */
+	nr_buf = (uint32_t)(((uint64_t)cfg->mem_size.cur * 1024 * 1024) /
+			((uint64_t)buf_size * 2));
+	cfg->nr_buf = nr_buf;
+	if (nr_buf < nb_workers) {
+		printf("Error: %u buffers cannot be shared between %u workers.\n",
+			nr_buf, nb_workers);
+		return;
+	}
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/* Prepare every worker's parameters before any worker is launched. */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+		/* zeroed so that the flags and counters in worker_info start from a known state */
+		lcores_p[i].v_ptr = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!lcores_p[i].v_ptr) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			goto out;
+		}
+		if (is_dma) {
+			lcores_p[i].v_ptr->dma_name = ldm->dma_names[i];
+			lcores_p[i].v_ptr->dev_id = ldm->dma_ids[i];
+			lcores_p[i].v_ptr->kick_batch = kick_batch;
+		}
+		lcores_p[i].v_ptr->worker_id = i;
+		lcores_p[i].v_ptr->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		lcores_p[i].v_ptr->buf_size = buf_size;
+		lcores_p[i].v_ptr->test_secs = test_secs;
+		lcores_p[i].v_ptr->srcs = srcs + offset;
+		lcores_p[i].v_ptr->dsts = dsts + offset;
+		lcores_p[i].v_ptr->scenario_id = cfg->scenario_id;
+		lcores_p[i].v_ptr->lcore_id = lcore_id;
+	}
+
+	for (i = 0; i < nb_workers; i++) {
+		/*
+		 * Hand each worker the address of its own stored worker_id. The
+		 * loop counter must not be passed: it keeps changing while the
+		 * worker may not have read it yet.
+		 */
+		void *arg = &lcores_p[i].ptr->worker_id;
+
+		lcore_id = lcores_p[i].ptr->lcore_id;
+		if (rte_eal_remote_launch(is_dma ? do_dma_mem_copy : do_cpu_mem_copy,
+				arg, lcore_id) != 0) {
+			printf("Error: failed to launch the worker on lcore %u.\n", lcore_id);
+			break;
+		}
+		nb_launched++;
+	}
+
+	/* Wait until every launched worker has reported ready. */
+	for (i = 0; i < nb_launched; i++)
+		while (lcores_p[i].v_ptr->worker_info.ready_flag == false)
+			;
+
+	if (nb_launched < nb_workers) {
+		/*
+		 * Abort: the workers already running are past their own
+		 * stop_flag reset (they are ready), so ask them to stop and
+		 * release them; each ends after one pass over its buffers.
+		 */
+		for (i = 0; i < nb_launched; i++) {
+			lcores_p[i].v_ptr->worker_info.stop_flag = true;
+			lcores_p[i].v_ptr->worker_info.start_flag = true;
+		}
+		rte_eal_mp_wait_lcore();
+		goto out;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.start_flag = true;
+
+	/* warm-up, then take the starting snapshot of the completion counters */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.test_cpl = lcores_p[i].v_ptr->worker_info.total_cpl;
+
+	/* sleep() takes seconds, so no microsecond product that could overflow */
+	sleep(test_secs);
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.test_cpl = lcores_p[i].v_ptr->worker_info.total_cpl -
+						lcores_p[i].v_ptr->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			lcores_p[i].v_ptr->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, lcores_p[i].v_ptr->lcore_id,
+					lcores_p[i].v_ptr->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free mbufs used in the test */
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	/* free the pointer arrays for the mbufs */
+	rte_free(srcs);
+	srcs = NULL;
+	rte_free(dsts);
+	dsts = NULL;
+
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		/* reset dst_pool itself so the next scenario never sees a freed pool */
+		dst_pool = NULL;
+	}
+
+	/* free the worker parameters */
+	for (i = 0; i < nb_workers; i++) {
+		rte_free(lcores_p[i].ptr);
+		lcores_p[i].ptr = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..b550f4b23f
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It must be a power of two, and between
+;  64 and 4096.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first,last,increment,ADD|MUL.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of the whole case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+; We have to ensure a 1:1 mapping between the core and DMA device.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..b782ea5258
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+/*
+ * Values stored in test_configure.test_type. TEST_TYPE_NONE is zero, i.e. what
+ * an entry of the zero-initialised test_cases[] array holds until
+ * load_configs() assigns a type.
+ */
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every non-empty line staged in output_str[] to the result file, in
+ * row order, clearing each line once written. When need_blankline is set, two
+ * rows of empty cells are written first. The file is flushed at the end.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row = 0;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	while (row < RTE_DIM(output_str)) {
+		char *line = output_str[row++];
+
+		if (line[0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", line);
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/*
+ * Stage and write the environment section of the result file: a title row
+ * and the value of rte_get_timer_hz() under the "CPU frequency" label.
+ */
+static void
+output_env_info(void)
+{
+	const uint64_t timer_hz = rte_get_timer_hz();
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%" PRIu64 "\n",
+			timer_hz);
+
+	output_csv(true);
+}
+
+/* Stage the CSV column header of a test case in row 0 and write it out. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	const char *type_name = case_cfg->test_type_str;
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, CSV_HDR_FMT, case_id, type_name);
+
+	output_csv(true);
+}
+
+/*
+ * Dispatch one scenario to mem_copy_benchmark() according to the case's test
+ * type; an unrecognised type is reported and nothing is run.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const bool use_dma = (case_cfg->test_type == TEST_TYPE_DMA_MEM_COPY);
+
+	if (!use_dma && case_cfg->test_type != TEST_TYPE_CPU_MEM_COPY) {
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+		return;
+	}
+
+	mem_copy_benchmark(case_cfg, use_dma);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * At most one of mem_size, buf_size, ring_size and kick_batch is expected to
+ * be variable (non-zero incr). That entry is stepped from first to last with
+ * its ADD or MUL operator and one scenario is run per value. With no variable
+ * entry a single scenario is run.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* one lcore is kept for the main thread, so strictly more lcores than workers */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		uint64_t next;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else
+			break;
+
+		/*
+		 * Stop when the step makes no progress (e.g. multiplying by 1
+		 * or starting from 0) or no longer fits the 32-bit field;
+		 * otherwise the sweep would never reach "last".
+		 */
+		if (next <= var_entry->cur || next > UINT32_MAX)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case: a list of lcore ids separated
+ * by commas and/or blanks, e.g. "3, 4".
+ *
+ * The ids are stored in test_case->lcore_dma_map, which is reset first.
+ * Returns 0 on success, -1 on a NULL argument, allocation failure, a token
+ * that is not a number, or more ids than the map can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	char *input;
+	char *token;
+	struct lcore_dma_map_t *lcore_dma_map;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() writes into its argument, so tokenize a private, complete copy */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &(test_case->lcore_dma_map);
+
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		char *end;
+		unsigned long lcore_id;
+
+		/* lcores[] holds MAX_WORKER_NB entries */
+		if (lcore_dma_map->cnt >= MAX_WORKER_NB) {
+			free(input);
+			return -1;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0') {
+			fprintf(stderr, "Illegal lcore %s\n", token);
+			free(input);
+			return -1;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint16_t)lcore_id;
+	}
+
+	free(input);
+	return 0;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case: a comma separated list of
+ * "lcore<id>@<dma device name>" pairs, e.g.
+ * "lcore10@0000:00:04.2, lcore11@0000:00:04.3".
+ *
+ * The pairs are stored in test_case->lcore_dma_map, which is reset first.
+ * Returns 0 on success and -1 on any malformed or excessive input.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() and rte_strsplit() write into their argument: work on a copy */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+	addrs = input;
+
+	/* skip leading blanks; stops at the terminator of an empty string */
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	memset(&test_case->lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	do {
+		/* both halves are needed: without '@' ptrs[1] would stay unset */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal DMA address\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		/* step over the "lcore" prefix */
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map = &test_case->lcore_dma_map;
+		/* lcores[] and dma_names[] hold MAX_WORKER_NB entries */
+		if (lcore_dma_map->cnt >= MAX_WORKER_NB) {
+			fprintf(stderr, "lcores count error\n");
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+				RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse one configuration value into entry.
+ *
+ * Accepted forms are a single number ("1024") or the variable form
+ * "first,last,increment,ADD|MUL".
+ *
+ * Returns the number of fields parsed (1 or 4) on success and -1 on error:
+ * NULL or empty input, any other number of fields, or an unknown operator.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	strlcpy(input, value, sizeof(input));
+	if (*input == '\0')
+		return -1;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	/*
+	 * 2 or 3 fields is malformed. Report it as an error rather than
+	 * returning the positive count, which callers treat as success.
+	 */
+	if (args_nr != 1 && args_nr != 4)
+		return -1;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+
+	if (args_nr == 4) {
+		entry->last = (uint32_t)atoi(args[1]);
+		entry->incr = (uint32_t)atoi(args[2]);
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s.\n", args[3]);
+			args_nr = -1;
+		}
+	} else {
+		entry->op = OP_NONE;
+		entry->last = 0;
+		entry->incr = 0;
+	}
+
+	return args_nr;
+}
+
+/*
+ * Look up an integer entry of a section. Returns 0 and stores the value in
+ * *val, or -1 (after reporting the missing entry) when the section has no
+ * such entry.
+ */
+static int
+get_int_entry(struct rte_cfgfile *cfgfile, const char *section, const char *name, int *val)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, name);
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", name, section);
+		return -1;
+	}
+
+	*val = atoi(str);
+	return 0;
+}
+
+/*
+ * Load the configuration file at path into test_cases[].
+ *
+ * Sections are looked up by the names "case1" .. "caseN", N being the number
+ * of sections in the file. A case that cannot be parsed is marked invalid and
+ * skipped by main(); the other cases are still loaded. An unreadable file or
+ * too many sections ends the process.
+ *
+ * Returns the number of sections processed.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *eal_args;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+	int src_numa, dst_numa, cache_flush, test_secs;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d, the test will be finished here.\n",
+				i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Cannot find case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		nb_vp = 0;
+
+		/* A missing key invalidates the case instead of feeding NULL to atoi(). */
+		if (get_int_entry(cfgfile, section_name, "src_numa_node", &src_numa) < 0 ||
+				get_int_entry(cfgfile, section_name, "dst_numa_node", &dst_numa) < 0 ||
+				get_int_entry(cfgfile, section_name, "cache_flush", &cache_flush) < 0 ||
+				get_int_entry(cfgfile, section_name, "test_seconds", &test_secs) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+
+		test_case->src_numa_node = src_numa;
+		test_case->dst_numa_node = dst_numa;
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Error, each section can only have a single variable parameter.\n");
+			test_case->is_valid = false;
+			continue;
+		}
+
+		test_case->cache_flush = cache_flush;
+		test_case->test_secs = (uint16_t)test_secs;
+
+		/*
+		 * Keep a private copy of eal_args: the string returned by
+		 * rte_cfgfile_get_entry() belongs to cfgfile, which is closed
+		 * below, while eal_args is read later from main(). The copy
+		 * lives for the rest of the process and is never freed.
+		 */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->eal_args = NULL;
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: No memory for eal_args in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector passed to rte_eal_init() for one test case.
+ *
+ * The application's own command line is copied into new_argv, leaving out
+ * "--config"/"--result" together with the argument following each, and the
+ * blank separated words of eal_args (may be NULL) are appended.
+ *
+ * new_argv must provide MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes;
+ * arguments beyond that capacity are dropped with an error message.
+ *
+ * Returns the number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* also skip the option's value */
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB)
+			goto too_many;
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, MAX_EAL_PARAM_LEN);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			/* repeated blanks split into empty words; do not forward those */
+			if (tokens[i][0] == '\0')
+				continue;
+			if (new_argc >= MAX_EAL_PARAM_NB)
+				goto too_many;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+
+too_many:
+	fprintf(stderr, "Too many EAL arguments, only the first %d are used.\n",
+		MAX_EAL_PARAM_NB);
+	return new_argc;
+}
+
+/*
+ * Entry point.
+ *
+ * Reads the configuration named by "--config", truncates the result file
+ * ("--result", or "<config name>_result.csv" derived from the config path)
+ * and then runs every valid test case in a forked child, so that each case
+ * initialises the EAL with its own arguments. The parent waits for each child
+ * and reports how it ended.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last simply yields no path */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		static const char suffix[] = "_result.csv";
+		char *token;
+		size_t used;
+
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		token = strtok(basename(rst_path), ".");
+		if (token == NULL) {
+			printf("Config file error.\n");
+			return -1;
+		}
+		/*
+		 * The suffix is written in place behind the name, so the name
+		 * must lie inside rst_path (basename() may return a string of
+		 * its own) and the suffix must still fit.
+		 */
+		if ((uintptr_t)token < (uintptr_t)rst_path ||
+				(uintptr_t)token >= (uintptr_t)rst_path + sizeof(rst_path)) {
+			printf("Config file error.\n");
+			return -1;
+		}
+		used = (size_t)(token - rst_path) + strlen(token);
+		if (used + sizeof(suffix) > sizeof(rst_path)) {
+			printf("Result file path is too long.\n");
+			return -1;
+		}
+		memcpy(rst_path + used, suffix, sizeof(suffix));
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* create or truncate the result file; every case appends to it later */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %d.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				rte_eal_cleanup();
+				/* report the failure to the parent instead of success */
+				exit(EXIT_FAILURE);
+			}
+
+			if (is_first_case)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus)) {
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+				/*
+				 * Whatever the child changes is lost when it
+				 * exits, so the "environment already written"
+				 * state has to be tracked in the parent.
+				 */
+				if (WEXITSTATUS(wstatus) == EXIT_SUCCESS)
+					is_first_case = false;
+			} else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a variable parameter is stepped from one scenario to the next. */
+typedef enum {
+	OP_NONE = 0,	/* fixed value: a single scenario */
+	OP_ADD,		/* next = cur + incr */
+	OP_MUL		/* next = cur * incr */
+} alg_op_type;
+
+/*
+ * One numeric parameter of a test case, either fixed (op == OP_NONE, incr and
+ * last are 0) or swept from first to last.
+ */
+struct test_configure_entry {
+	uint32_t first;	/* value of the first scenario */
+	uint32_t last;	/* the sweep continues while cur <= last */
+	uint32_t incr;	/* step operand; non-zero marks the entry as variable */
+	alg_op_type op;	/* how incr is applied to cur */
+	uint32_t cur;	/* value used by the scenario currently running */
+};
+
+/*
+ * Worker table of a test case; index i describes worker i. Only lcores[] and
+ * cnt are filled for CPU tests.
+ */
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB];	/* lcore id each worker runs on */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];	/* DMA device name per worker */
+	int16_t dma_ids[MAX_WORKER_NB];	/* dmadev ids resolved from dma_names[] */
+	uint16_t cnt;	/* number of valid entries, i.e. number of workers */
+};
+
+/* Everything parsed from one [caseN] section, plus per-scenario run state. */
+struct test_configure {
+	bool is_valid;	/* false when the section could not be parsed */
+	uint8_t test_type;
+	const char *test_type_str;
+	uint16_t src_numa_node;
+	uint16_t dst_numa_node;
+	uint16_t opcode;	/* NOTE(review): not referenced by main.c or benchmark.c in this patch */
+	bool is_dma;		/* NOTE(review): not referenced by main.c or benchmark.c in this patch */
+	struct lcore_dma_map_t lcore_dma_map;
+	struct test_configure_entry mem_size;	/* memory footprint, in MB */
+	struct test_configure_entry buf_size;	/* bytes per copy operation */
+	struct test_configure_entry ring_size;	/* DMA ring descriptors (DMA tests only) */
+	struct test_configure_entry kick_batch;	/* DMA submit batch (DMA tests only) */
+	uint32_t cache_flush;	/* non-zero: flush the buffers from the caches before the run */
+	uint32_t nr_buf;	/* set by mem_copy_benchmark() from mem_size and buf_size */
+	uint16_t test_secs;	/* length of the measured window, in seconds */
+	const char *eal_args;	/* extra EAL arguments of the case, may be NULL */
+	uint8_t scenario_id;	/* 1-based number of the scenario being run */
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows # leave this app out of the build on Windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile'] # DPDK libraries this app depends on
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-18  5:34                         ` Jiang, Cheng1
@ 2023-06-19  5:25                           ` Anoob Joseph
  2023-06-19  6:17                             ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-19  5:25 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Cheng,

Please see inline.

Thanks,
Anoob

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Sunday, June 18, 2023 11:05 AM
> To: Anoob Joseph <anoobj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Anoob,
> 
> I've looked into the memory footprint issue, and I didn't find any.
> So could you please help to share the config file you are using? Maybe it can
> help me to accelerate the debug process.

[Anoob] We had to increase the hugepages to get the test running. Otherwise simple memory allocations were failing. Some of structs have redundant fields which can be easily addressed.

For example please check below members in struct lcore_params,
1. scenario_id
2. lcore_id
3. dma_name
4. worker_id
5. test_secs

Also, some of the parameters in the above struct is const for a thread (like buf_size, kick_batch etc). So I was thinking may be it is better to split lcore_params into two portions and have only the dynamic part as volatile. But that is something we can take up later. I leave that to your judgement.

> 
> Thanks a lot,
> Cheng
> 
> > -----Original Message-----
> > From: Jiang, Cheng1
> > Sent: Friday, June 16, 2023 11:16 PM
> > To: Anoob Joseph <anoobj@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> > <Xuan.Ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <Chenbo.Xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <WenwuX.Ma@intel.com>;
> Wang,
> > YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Anoob,
> >
> > Replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Friday, June 16, 2023 6:53 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > Wang,
> > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi Cheng,
> > >
> > > I think there is a bug in EAL parsing. Please check the below diff
> > > and see if we should do something similar.
> > >
> > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > > index
> > > d65655b87b..7fcaa5acf6 100644
> > > --- a/app/test-dma-perf/main.c
> > > +++ b/app/test-dma-perf/main.c
> > > @@ -432,7 +432,7 @@ append_eal_args(int argc, char **argv, const
> > > char *eal_args, char **new_argv)
> > >                         i++;
> > >                         continue;
> > >                 }
> > > -               strlcpy(new_argv[new_argc], argv[i],
> sizeof(new_argv[new_argc]));
> > > +               strlcpy(new_argv[new_argc], argv[i],
> > > + MAX_EAL_PARAM_LEN);
> > >                 new_argc++;
> > >         }
> > >
> > > Thanks,
> > > Anoob
> >
> > [Cheng] yes there is an issue in it. And I have the same fix. I'll
> > submit it later, thanks.
> >
> > >
> > > > -----Original Message-----
> > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > Sent: Friday, June 16, 2023 3:19 PM
> > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> > Chenbo
> > > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > > <amitprakashs@marvell.com>; Ma, WenwuX
> <wenwux.ma@intel.com>;
> > > Wang,
> > > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > Hi Cheng,
> > > >
> > > > > [Cheng] sure, no problem. Feel free to contact me if you have
> > > > > any more questions.
> > > >
> > > > No further comments from my side. Can you submit next version?
> >
> > [Cheng] Glad to know. Yes, the next version is almost ready, I'll
> > submit it this weekend.
> >
> > > >
> > > > Top level, I think you may need to add a doc file for the new app.
> >
> > [Cheng] sure, I was thinking about this, and I'll add a doc for this, thanks.
> >
> > > >
> > > > Few issues that we are also checking in our end, 1. The app is
> > > > having significant memory footprint. Need to see where we can
> improve.
> > > > 2. EAL args passing doesn't seem to be working. Are you able to
> > > > enable specific devices by making using of '-a' arg?
> >
> > [Cheng] I'll take a look at the significant memory footprint issue later,
> thanks.
> > As for the eal args process, yes there is an issue, Dengdui's comments
> > also mentioned it, and I have fixed it in the v7 patch.
> >
> > > >
> > > > Thanks,
> > > > Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-17  4:35                         ` Jiang, Cheng1
@ 2023-06-19  5:48                           ` Anoob Joseph
  2023-06-19  6:21                             ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-19  5:48 UTC (permalink / raw)
  To: Jiang, Cheng1, Richardson, Bruce, thomas, Kevin Laatz, Chengwen Feng
  Cc: dev, Hu, Jiayu, Ding, Xuan, mb, Xia, Chenbo, Amit Prakash Shukla,
	Ma, WenwuX, Wang, YuanX, He, Xingguang,
	Jerin Jacob Kollanukkaran

Hi Cheng,

> Do you think I should add a folder in the doc/guides/ just like
> doc/guides/dma_perf?

I think it should be under doc/guides/tools. I'll let Thomas & DMA maintainers comment further.

You can use below patch as a reference,

commit 1f5cfe964eefe96b5f8de1fadf9cc8fd1e214240
Author: Anoob Joseph <anoobj@marvell.com>
Date:   Thu Nov 3 18:16:11 2022 +0530

    app/security-perf: add session performance test

Thanks,
Anoob


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-19  5:25                           ` Anoob Joseph
@ 2023-06-19  6:17                             ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-19  6:17 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: dev, Hu, Jiayu, Ding, Xuan, thomas, Richardson, Bruce, mb, Xia,
	Chenbo, Amit Prakash Shukla, Ma, WenwuX, Wang, YuanX, He,
	Xingguang

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Monday, June 19, 2023 1:26 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Please see inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Sunday, June 18, 2023 11:05 AM
> > To: Anoob Joseph <anoobj@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang,
> > YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Anoob,
> >
> > I've looked into the memory footprint issue, and I didn't find any.
> > So could you please help to share the config file you are using? Maybe
> > it can help me to accelerate the debug process.
> 
> [Anoob] We had to increase the hugepages to get the test running.
> Otherwise simple memory allocations were failing. Some of structs have
> redundant fields which can be easily addressed.
> 
> For example please check below members in struct lcore_params, 1.
> scenario_id 2. lcore_id 3. dma_name 4. worker_id 5. test_secs
> 
> Also, some of the parameters in the above struct is const for a thread (like
> buf_size, kick_batch etc). So I was thinking may be it is better to split
> lcore_params into two portions and have only the dynamic part as volatile.
> But that is something we can take up later. I leave that to your judgement.
> 

[Cheng] OK, I'll take a look and try to make some improvements. Thanks for your advice.

> >
> > Thanks a lot,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Jiang, Cheng1
> > > Sent: Friday, June 16, 2023 11:16 PM
> > > To: Anoob Joseph <anoobj@marvell.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> > > <Xuan.Ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> Chenbo
> > > <Chenbo.Xia@intel.com>; Amit Prakash Shukla
> > > <amitprakashs@marvell.com>; Ma, WenwuX <WenwuX.Ma@intel.com>;
> > Wang,
> > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi Anoob,
> > >
> > > Replies are inline.
> > >
> > > Thanks,
> > > Cheng
> > >
> > > > -----Original Message-----
> > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > Sent: Friday, June 16, 2023 6:53 PM
> > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> > Chenbo
> > > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > > <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> > > Wang,
> > > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> > <xingguang.he@intel.com>
> > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > Hi Cheng,
> > > >
> > > > I think there is a bug in EAL parsing. Please check the below diff
> > > > and see if we should do something similar.
> > > >
> > > > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > > > index
> > > > d65655b87b..7fcaa5acf6 100644
> > > > --- a/app/test-dma-perf/main.c
> > > > +++ b/app/test-dma-perf/main.c
> > > > @@ -432,7 +432,7 @@ append_eal_args(int argc, char **argv, const
> > > > char *eal_args, char **new_argv)
> > > >                         i++;
> > > >                         continue;
> > > >                 }
> > > > -               strlcpy(new_argv[new_argc], argv[i],
> > sizeof(new_argv[new_argc]));
> > > > +               strlcpy(new_argv[new_argc], argv[i],
> > > > + MAX_EAL_PARAM_LEN);
> > > >                 new_argc++;
> > > >         }
> > > >
> > > > Thanks,
> > > > Anoob
> > >
> > > [Cheng] yes there is an issue in it. And I have the same fix. I'll
> > > submit it later, thanks.
> > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Anoob Joseph <anoobj@marvell.com>
> > > > > Sent: Friday, June 16, 2023 3:19 PM
> > > > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > > > <xuan.ding@intel.com>; thomas@monjalon.net; Richardson, Bruce
> > > > > <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia,
> > > Chenbo
> > > > > <chenbo.xia@intel.com>; Amit Prakash Shukla
> > > > > <amitprakashs@marvell.com>; Ma, WenwuX
> > <wenwux.ma@intel.com>;
> > > > Wang,
> > > > > YuanX <yuanx.wang@intel.com>; He, Xingguang
> > > <xingguang.he@intel.com>
> > > > > Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf
> > > > > application
> > > > >
> > > > > Hi Cheng,
> > > > >
> > > > > > [Cheng] sure, no problem. Feel free to contact me if you have
> > > > > > any more questions.
> > > > >
> > > > > No further comments from my side. Can you submit next version?
> > >
> > > [Cheng] Glad to know. Yes, the next version is almost ready, I'll
> > > submit it this weekend.
> > >
> > > > >
> > > > > Top level, I think you may need to add a doc file for the new app.
> > >
> > > [Cheng] sure, I was thinking about this, and I'll add a doc for this, thanks.
> > >
> > > > >
> > > > > Few issues that we are also checking in our end, 1. The app is
> > > > > having significant memory footprint. Need to see where we can
> > improve.
> > > > > 2. EAL args passing doesn't seem to be working. Are you able to
> > > > > enable specific devices by making using of '-a' arg?
> > >
> > > [Cheng] I'll take a look at the significant memory footprint issue
> > > later,
> > thanks.
> > > As for the eal args process, yes there is an issue, Dengdui's
> > > comments also mentioned it, and I have fixed it in the v7 patch.
> > >
> > > > >
> > > > > Thanks,
> > > > > Anoob

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
  2023-06-19  5:48                           ` Anoob Joseph
@ 2023-06-19  6:21                             ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-19  6:21 UTC (permalink / raw)
  To: Anoob Joseph, Richardson, Bruce, thomas, Laatz, Kevin, Chengwen Feng
  Cc: dev, Hu, Jiayu, Ding, Xuan, mb, Xia, Chenbo, Amit Prakash Shukla,
	Ma, WenwuX, Wang, YuanX, He, Xingguang,
	Jerin Jacob Kollanukkaran

Hi Anoob,

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Monday, June 19, 2023 1:49 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; thomas@monjalon.net; Laatz, Kevin
> <kevin.laatz@intel.com>; Chengwen Feng <fengchengwen@huawei.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Ma, WenwuX <wenwux.ma@intel.com>;
> Wang, YuanX <yuanx.wang@intel.com>; He, Xingguang
> <xingguang.he@intel.com>; Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: RE: [EXT] [PATCH v6] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> > Do you think I should add a folder in the doc/guides/ just like
> > doc/guides/dma_perf?
> 
> I think it should be under doc/guides/tools. I'll let Thomas & DMA
> maintainers comment further.

OK, I'll try to add the doc in the tools folder first.

> 
> You can use below patch as a reference,
> 
> commit 1f5cfe964eefe96b5f8de1fadf9cc8fd1e214240
> Author: Anoob Joseph <anoobj@marvell.com>
> Date:   Thu Nov 3 18:16:11 2022 +0530
> 
>     app/security-perf: add session performance test
> 
> Thanks,
> Anoob

Sure, thanks a lot!
Cheng


^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v8] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (5 preceding siblings ...)
  2023-06-18 12:26 ` [PATCH v7] " Cheng Jiang
@ 2023-06-20  6:53 ` Cheng Jiang
  2023-06-23  6:52   ` [EXT] " Anoob Joseph
  2023-06-26  9:41 ` [PATCH v9] " Cheng Jiang
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 53+ messages in thread
From: Cheng Jiang @ 2023-06-20  6:53 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, fengchengwen, jerinj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	weix.ling, Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Only the memory copy performance test is supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v8:
  fixed string copy issue in parse_lcore();
  improved some data display format;
  added doc in doc/guides/tools;
  updated release notes;

v7:
  fixed some strcpy issues;
  removed cache setup in calling rte_pktmbuf_pool_create();
  fixed some typos;
  added some memory free and null set operations;
  improved result calculation;
v6:
  improved code based on Anoob's comments;
  fixed some code structure issues;
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build                        |   1 +
 app/test-dma-perf/benchmark.c          | 498 +++++++++++++++++++++
 app/test-dma-perf/config.ini           |  61 +++
 app/test-dma-perf/main.c               | 594 +++++++++++++++++++++++++
 app/test-dma-perf/main.h               |  69 +++
 app/test-dma-perf/meson.build          |  17 +
 doc/guides/rel_notes/release_23_07.rst |   6 +
 doc/guides/tools/dmaperf.rst           | 103 +++++
 doc/guides/tools/index.rst             |   1 +
 9 files changed, 1350 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build
 create mode 100644 doc/guides/tools/dmaperf.rst

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..ac6aae4752
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,498 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+#define POLL_MAX 1000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+struct worker_info { /* handshake flags and completion counters shared by main and one worker */
+	bool ready_flag; /* set by the worker once it has entered its copy function */
+	bool start_flag; /* set by main to release the worker from its spin-wait */
+	bool stop_flag; /* set by main; the worker checks it after each pass over its buffers */
+	uint32_t total_cpl; /* running count of completed copies, updated by the worker */
+	uint32_t test_cpl; /* main only: snapshot of total_cpl, later the delta for the timed window */
+};
+
+struct lcore_params { /* per-worker parameters, filled in by mem_copy_benchmark() */
+	uint8_t scenario_id;
+	unsigned int lcore_id; /* lcore the worker runs on; also used when printing results */
+	char *dma_name; /* points into cfg->lcore_dma_map; only set for DMA tests */
+	uint16_t worker_id; /* index of this entry in lcores_p[] */
+	uint16_t dev_id; /* dmadev id; only set for DMA tests */
+	uint32_t nr_buf; /* number of buffers in this worker's slice */
+	uint16_t kick_batch; /* only set for DMA tests */
+	uint32_t buf_size; /* bytes per copy */
+	uint16_t test_secs;
+	struct rte_mbuf **srcs; /* start of this worker's slice of the shared source array */
+	struct rte_mbuf **dsts; /* start of this worker's slice of the shared destination array */
+	struct worker_info worker_info;
+};
+
+union lcore_params_union { /* one pointer, two views of the same lcore_params */
+	volatile struct lcore_params *v_ptr; /* view used for accesses shared between main and worker */
+	struct lcore_params *ptr; /* plain view, used when freeing */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static union lcore_params_union lcores_p[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write "In <func>:<lineno> - " followed by the printf-style message to
+ * stderr. Returns the sum of the two stdio return values.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int prefix_len;
+	int msg_len;
+
+	prefix_len = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	msg_len = vfprintf(stderr, format, args);
+	va_end(args);
+
+	return prefix_len + msg_len;
+}
+
+/*
+ * Derive the figures reported for one worker of a finished test run.
+ *
+ * buf_size:   bytes copied per operation
+ * nr_buf:     number of buffers per direction across all workers
+ * nb_workers: number of workers the buffers are split between (non-zero)
+ * test_secs:  length of the measurement window in seconds
+ * total_cnt:  copies this worker completed inside the window
+ * memory:     out, MB of source plus destination buffers used by this worker
+ * ave_cycle:  out, timer cycles per copy; 0 when nothing was completed
+ * bandwidth:  out, Gbps
+ * mops:       out, million copies per second
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	float ops;
+
+	/* Widen before multiplying so a large footprint does not wrap at 32 bits. */
+	*memory = (float)((uint64_t)buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+	/* A worker that completed nothing must not trigger an integer division by zero. */
+	*ave_cycle = total_cnt ? test_secs * rte_get_timer_hz() / total_cnt : 0;
+	ops = (float)total_cnt / test_secs;
+	*mops = ops / (1000 * 1000);
+	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
+}
+
+/*
+ * Print one worker's results to stdout and store the matching CSV line in
+ * output_str[lcore_id].
+ *
+ * dma_name is only read when is_dma is true. The CSV line is dropped, with a
+ * message on stderr, when lcore_id has no row in output_str.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, float memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
+	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
+
+	/* output_str has MAX_WORKER_NB rows but is indexed by lcore id. */
+	if (lcore_id >= MAX_WORKER_NB) {
+		fprintf(stderr, "lcore %u exceeds the result table, CSV line skipped.\n",
+			lcore_id);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of the data area of each of the nr_buf mbufs
+ * in array from the CPU caches, stepping 64 bytes at a time. The body is
+ * compiled out unless RTE_ARCH_X86_64 is defined.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t idx;
+
+	for (idx = 0; idx < nr_buf; idx++) {
+		char *base = rte_pktmbuf_mtod(array[idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure DMA device dev_id with a single memory-to-memory virtual channel
+ * of ring_size descriptors and start it. Any failure terminates the
+ * application through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	/* info is only meaningful when the query itself succeeded. */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve, configure and start the DMA device of every worker listed in
+ * cfg->lcore_dma_map, storing each device id in dma_ids[].
+ *
+ * Returns 0 when every worker got its device, -1 when a device name could not
+ * be resolved. Configuration errors terminate the application inside
+ * configure_dmadev_queue().
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		/* The lookup reports failure as a negative errno, not as -1. */
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+/*
+ * Submit the copies enqueued so far on vchan 0 of dev_id, then collect up to
+ * MAX_DMA_CPL_NB completions. The collected count is subtracted from
+ * *async_cnt and added to worker_info->total_cpl. A failing submit stops and
+ * closes the device and terminates the application.
+ */
+static inline void
+do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
+			volatile struct worker_info *worker_info)
+{
+	uint16_t done;
+
+	if (rte_dma_submit(dev_id, 0) < 0) {
+		rte_dma_stop(dev_id);
+		rte_dma_close(dev_id);
+		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
+	}
+
+	done = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+	worker_info->total_cpl += done;
+	*async_cnt -= done;
+}
+
+/*
+ * Worker entry point for the DMA copy test.
+ *
+ * p points to a uint16_t index into lcores_p[]. The worker sets ready_flag,
+ * spins until start_flag is set, then keeps enqueueing srcs[i] -> dsts[i]
+ * copies on vchan 0, submitting and polling after every kick_batch enqueues.
+ * stop_flag is examined only after a complete pass over the buffers. Before
+ * returning, outstanding copies are submitted and polled for at most POLL_MAX
+ * rounds.
+ *
+ * Always returns 0. An enqueue error other than -ENOSPC stops the device and
+ * terminates the application.
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	const uint16_t *idx_ptr = (uint16_t *)p;
+	volatile struct lcore_params *params = lcores_p[*idx_ptr].v_ptr;
+	volatile struct worker_info *info = &(params->worker_info);
+	const uint16_t dev_id = params->dev_id;
+	const uint32_t nr_buf = params->nr_buf;
+	const uint16_t kick_batch = params->kick_batch;
+	const uint32_t buf_size = params->buf_size;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t in_flight = 0;
+	uint32_t polls = 0;
+	uint32_t i;
+	int ret;
+
+	info->stop_flag = false;
+	info->ready_flag = true;
+
+	while (!info->start_flag)
+		;
+
+	do {
+		for (i = 0; i < nr_buf; i++) {
+			/* Keep retrying the same buffer for as long as the ring is full. */
+			while (unlikely((ret = rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]),
+					rte_pktmbuf_iova(dsts[i]), buf_size, 0)) < 0)) {
+				if (ret != -ENOSPC) {
+					/* Error exit */
+					rte_dma_stop(dev_id);
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+				do_dma_submit_and_poll(dev_id, &in_flight, info);
+			}
+			in_flight++;
+
+			if ((in_flight % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &in_flight, info);
+		}
+	} while (!info->stop_flag);
+
+	/* Drain what is still outstanding, bounded by POLL_MAX polls. */
+	rte_dma_submit(dev_id, 0);
+	while ((in_flight > 0) && (polls++ < POLL_MAX))
+		in_flight -= rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+
+	return 0;
+}
+
+/*
+ * Worker entry point for the CPU copy test.
+ *
+ * p points to a uint16_t index into lcores_p[]. The worker sets ready_flag,
+ * spins until start_flag is set, then copies buf_size bytes from srcs[i] to
+ * dsts[i] for each of its buffers with rte_memcpy(), counting every copy in
+ * total_cpl. stop_flag is examined only after a complete pass over the
+ * buffers. Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	const uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint32_t nr_buf = para->nr_buf;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * Copy buffer from src to dst through the CPU-visible data
+			 * pointers. An IOVA is a device address and is only usable
+			 * as a pointer when the EAL runs in IOVA-as-VA mode.
+			 */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA nodes
+ * and take cfg->nr_buf mbufs out of each.
+ *
+ * srcs, dsts: out, rte_zmalloc'ed arrays of cfg->nr_buf mbuf pointers.
+ *
+ * Returns 0 on success and -1 on failure. Nothing is released here on
+ * failure; the pools (src_pool/dst_pool) and both arrays are left for the
+ * caller to free. The arrays are zero-filled so that a caller which bulk-frees
+ * them after a partial failure only meets NULL entries.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
+		return -1;
+	}
+
+	/* The pool's data room size is 16 bits wide; refuse sizes it would truncate. */
+	if (buf_size > UINT16_MAX - RTE_PKTMBUF_HEADROOM) {
+		printf("Error: buffer size %u does not fit in a single mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one memory-copy test case and report the result of every worker.
+ *
+ * cfg:    test case settings; cfg->nr_buf is computed and stored here.
+ * is_dma: true to copy with the DMA devices in cfg->lcore_dma_map, false to
+ *         copy with the CPU.
+ *
+ * Sequence: set up buffers (and DMA devices), build the parameters of all
+ * workers, launch them, wait until all are ready, start them together, sample
+ * the completion counters TEST_WAIT_U_SECOND after the start and again
+ * test_secs later, stop the workers and print the per-worker figures. All
+ * resources are released before returning, on success and on failure.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/*
+	 * Build the parameters of every worker before launching any of them, so
+	 * that an allocation failure can bail out while no worker is running.
+	 */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+		/* Zeroed so the handshake flags and counters start from a known state. */
+		lcores_p[i].v_ptr = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!lcores_p[i].v_ptr) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			goto out;
+		}
+		if (is_dma) {
+			lcores_p[i].v_ptr->dma_name = ldm->dma_names[i];
+			lcores_p[i].v_ptr->dev_id = ldm->dma_ids[i];
+			lcores_p[i].v_ptr->kick_batch = kick_batch;
+		}
+		lcores_p[i].v_ptr->worker_id = i;
+		lcores_p[i].v_ptr->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		lcores_p[i].v_ptr->buf_size = buf_size;
+		lcores_p[i].v_ptr->test_secs = test_secs;
+		lcores_p[i].v_ptr->srcs = srcs + offset;
+		lcores_p[i].v_ptr->dsts = dsts + offset;
+		lcores_p[i].v_ptr->scenario_id = cfg->scenario_id;
+		lcores_p[i].v_ptr->lcore_id = lcore_id;
+	}
+
+	for (i = 0; i < nb_workers; i++) {
+		/*
+		 * Each worker reads its index through the pointer it is given. Hand
+		 * it the address of its own worker_id, which stays constant, rather
+		 * than the address of the loop counter, which keeps changing while
+		 * the worker starts up.
+		 */
+		void *arg = &lcores_p[i].ptr->worker_id;
+
+		lcore_id = ldm->lcores[i];
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy, arg, lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy, arg, lcore_id);
+	}
+
+	/* Wait until every worker has reported ready. */
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (lcores_p[i].v_ptr->worker_info.ready_flag == false) {
+				ready = false;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.start_flag = true;
+
+	/* Let the workers warm up, then snapshot the counters. */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.test_cpl = lcores_p[i].v_ptr->worker_info.total_cpl;
+
+	/* test_cpl becomes the number of copies completed inside the timed window. */
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.test_cpl = lcores_p[i].v_ptr->worker_info.total_cpl -
+						lcores_p[i].v_ptr->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		lcores_p[i].v_ptr->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			lcores_p[i].v_ptr->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, lcores_p[i].v_ptr->lcore_id,
+					lcores_p[i].v_ptr->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free mbufs used in the test */
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	/* free the pointers for the mbufs */
+	rte_free(srcs);
+	srcs = NULL;
+	rte_free(dsts);
+	dsts = NULL;
+
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		/* Reset the destination pool; a stale pointer would be freed again. */
+		dst_pool = NULL;
+	}
+
+	/* free the worker parameters */
+	for (i = 0; i < nb_workers; i++) {
+		rte_free(lcores_p[i].ptr);
+		lcores_p[i].ptr = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..b550f4b23f
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It must be a power of two, and between
+;  64 and 4096.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first,last,increment,ADD|MUL.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of each scenario.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+; We have to ensure a 1:1 mapping between the core and DMA device.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..d16e40571d
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size(B),nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Write every non-empty row of output_str to the result file and mark it as
+ * consumed. When need_blankline is set, two empty CSV rows are emitted first.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+
+	if (need_blankline) {
+		fputs(",,,,,,,,\n", fd);
+		fputs(",,,,,,,,\n", fd);
+	}
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		char *line = output_str[row];
+
+		if (line[0] == '\0')
+			continue;
+
+		fputs(line, fd);
+		/* Clear the row so that it is not written again by the next call. */
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/* Queue the environment rows (timer frequency in GHz) and write them to the result file. */
+static void
+output_env_info(void)
+{
+	const double timer_ghz = rte_get_timer_hz() / 1000000000.0;
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Test Environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%.3lf Ghz",
+			timer_ghz);
+
+	output_csv(true);
+}
+
+/* Queue the CSV column header for the given case and write it to the result file. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *header_row = output_str[0];
+
+	snprintf(header_row, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT,
+			case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/*
+ * Run one scenario of a case: both supported test types go through
+ * mem_copy_benchmark(), differing only in whether DMA is used.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY || type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, type == TEST_TYPE_DMA_MEM_COPY);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * The entry that carries a non-zero increment is swept from "first" to "last"
+ * using its ADD or MUL step; each value is one scenario. When no entry varies,
+ * a zeroed dummy entry makes the loop body run exactly once.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *candidates[] = {
+		&case_cfg->mem_size,
+		&case_cfg->buf_size,
+		&case_cfg->ring_size,
+		&case_cfg->kick_batch,
+	};
+	struct test_configure_entry *var_entry = &dummy;
+	const uint32_t nb_lcores = rte_lcore_count();
+	uint32_t i;
+
+	memset(output_str, 0, sizeof(output_str));
+
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	/* The last candidate with a non-zero increment is the one that varies. */
+	for (i = 0; i < RTE_DIM(candidates); i++) {
+		if (candidates[i]->incr != 0)
+			var_entry = candidates[i];
+	}
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	var_entry->cur = var_entry->first;
+	while (var_entry->cur <= var_entry->last) {
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			var_entry->cur *= var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			var_entry->cur += var_entry->incr;
+		else
+			break;	/* OP_NONE: a single scenario only */
+	}
+}
+
+/*
+ * Parse a list of lcore IDs separated by commas and/or spaces (e.g. "3, 4")
+ * into test_case->lcore_dma_map. The map is reset before parsing.
+ *
+ * Returns 0 on success and -1 on NULL arguments, allocation failure, or when
+ * the list holds more IDs than the map's lcores[] array can store.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	size_t len;
+	char *input;
+	char *token;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() writes into its argument, so tokenize a private copy. */
+	len = strlen(value);
+	input = malloc(len + 1);
+	if (input == NULL)
+		return -1;
+	strlcpy(input, value, len + 1);
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		/* Bound by the real array size, not by an unrelated constant. */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint16_t)atoi(token);
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse a comma-separated "lcore<id>@<dma name>" list into
+ * test_case->lcore_dma_map (lcore IDs and DMA device names, index-aligned).
+ *
+ * Returns 0 on success and -1 on NULL arguments, allocation failure, an empty
+ * list, an element that is not of the "lcore<id>@<name>" form, or more
+ * elements than the map's arrays can store.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() and rte_strsplit() write into the buffer, so use a private copy. */
+	input = strndup(value, strlen(value) + 1);
+	if (input == NULL) {
+		fprintf(stderr, "Cannot allocate memory for DMA addresses\n");
+		return -1;
+	}
+
+	if (*input == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(input, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	do {
+		/* Both halves are required; with fewer tokens ptrs[1] would be unset. */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal DMA address\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		/* Skip the "lcore" prefix to reach the numeric ID. */
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		/* Bound by the real array size, not by an unrelated constant. */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "lcores count error\n");
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+				RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse a config value of the form "first" or "first,last,increment,ADD|MUL"
+ * into entry.
+ *
+ * Returns the number of comma-separated fields (1 or 4) on success, and -1 on
+ * NULL arguments, an empty value, any other field count, or an unknown
+ * operation name.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	/* value is const, so split a bounded local copy instead. */
+	strlcpy(input, value, sizeof(input));
+	if (*input == '\0')
+		return -1;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr != 1 && args_nr != 4) {
+		/*
+		 * A 2- or 3-field value must be rejected here: returning its
+		 * positive count would make callers treat it as a valid entry.
+		 */
+		printf("Invalid entry %s.\n", value);
+		return -1;
+	}
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+
+	if (args_nr == 4) {
+		entry->last = (uint32_t)atoi(args[1]);
+		entry->incr = (uint32_t)atoi(args[2]);
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s.\n", args[3]);
+			args_nr = -1;
+		}
+	} else {
+		/* Single value: no sweep. */
+		entry->op = OP_NONE;
+		entry->last = 0;
+		entry->incr = 0;
+	}
+
+	return args_nr;
+}
+
+/* Fetch an integer entry of a section; returns false (and logs) when the entry is absent. */
+static bool
+get_int_entry(struct rte_cfgfile *cfgfile, const char *section_name,
+		const char *entry_name, int *value)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section_name, entry_name);
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", entry_name, section_name);
+		return false;
+	}
+
+	*value = atoi(str);
+	return true;
+}
+
+/*
+ * Load the configuration file at path into the global test_cases table.
+ *
+ * Sections are looked up by the names "case1", "case2", ... up to the number
+ * of sections in the file. A section that is missing, incomplete or malformed
+ * is reported and left with is_valid == false; the remaining sections are
+ * still parsed. The process exits if the file cannot be opened or holds more
+ * than MAX_TEST_CASES sections.
+ *
+ * Returns the number of sections processed.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *eal_args;
+	int args_nr, nb_vp;
+	int src_numa_node, dst_numa_node, cache_flush, test_secs;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d, the test will be finished here.\n",
+				i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Cannot find case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		/* Number of entries given as a "first,last,incr,op" sweep. */
+		nb_vp = 0;
+
+		/* A missing entry invalidates the case instead of feeding NULL to atoi(). */
+		if (!get_int_entry(cfgfile, section_name, "src_numa_node", &src_numa_node) ||
+				!get_int_entry(cfgfile, section_name, "dst_numa_node",
+					&dst_numa_node)) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->src_numa_node = src_numa_node;
+		test_case->dst_numa_node = dst_numa_node;
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Error, each section can only have a single variable parameter.\n");
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (!get_int_entry(cfgfile, section_name, "cache_flush", &cache_flush) ||
+				!get_int_entry(cfgfile, section_name, "test_seconds",
+					&test_secs)) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->cache_flush = cache_flush;
+		test_case->test_secs = (uint16_t)test_secs;
+
+		/*
+		 * The returned string is owned by cfgfile, which is closed below,
+		 * so keep a private copy. The copy is deliberately never freed:
+		 * test_cases is used until the process exits.
+		 */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->eal_args = NULL;
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: Cannot copy eal_args in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the EAL argument vector for one test case.
+ *
+ * Copies argv into new_argv while dropping "--config"/"--result" together with
+ * the argument that follows each of them, then appends the space-separated
+ * tokens of eal_args (if any). The caller is expected to provide
+ * MAX_EAL_PARAM_NB slots of MAX_EAL_PARAM_LEN bytes each in new_argv; arguments
+ * beyond that capacity are dropped with a warning instead of being written out
+ * of bounds.
+ *
+ * Returns the number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* Also skip the option's value. */
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB)
+			goto overflow;
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, MAX_EAL_PARAM_LEN);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			if (new_argc >= MAX_EAL_PARAM_NB)
+				goto overflow;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+
+overflow:
+	fprintf(stderr, "Too many EAL arguments, only the first %d are used.\n",
+			MAX_EAL_PARAM_NB);
+	return new_argc;
+}
+
+/*
+ * Entry point: load the configuration file given with "--config <path>", then
+ * run each valid test case in its own forked child process. Each child
+ * initializes EAL with the command-line arguments plus the case's eal_args and
+ * appends its results to the CSV file ("--result <path>", or a name derived
+ * from the configuration file when omitted).
+ *
+ * Returns 0 when all cases have been attempted, -1 on a setup error.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so a trailing option without a value yields NULL here. */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		/* Reserve room for the "_result.csv" suffix appended below. */
+		const size_t max_len = PATH_MAX - strlen("_result.csv");
+		char *token;
+
+		if (strlcpy(rst_path, cfg_path_ptr, max_len) >= max_len) {
+			printf("Config file path is too long.\n");
+			return -1;
+		}
+		/* Cut the file name at its first '.' and append the suffix in place. */
+		token = strtok(basename(rst_path), ".");
+		if (token == NULL) {
+			printf("Config file error.\n");
+			return -1;
+		}
+		strcat(token, "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+
+	/* Create or truncate the result file; each child reopens it in append mode. */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %d.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				/* Report the failure to the parent instead of exiting with 0. */
+				rte_eal_cleanup();
+				exit(EXIT_FAILURE);
+			}
+
+			if (is_first_case)
+				output_env_info();
+
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/*
+			 * The flag must be cleared in the parent: a change made by the
+			 * child is lost when it exits, which would make every case
+			 * emit the environment rows again.
+			 */
+			is_first_case = false;
+
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a swept configuration entry advances from one scenario to the next. */
+typedef enum {
+	/* Single value, no sweep. */
+	OP_NONE = 0,
+	/* The current value is increased by the increment. */
+	OP_ADD,
+	/* The current value is multiplied by the increment. */
+	OP_MUL
+} alg_op_type;
+
+/*
+ * One numeric configuration entry, written in the configuration file either as
+ * a single value or as "first,last,increment,ADD|MUL".
+ */
+struct test_configure_entry {
+	/* First (or only) value. */
+	uint32_t first;
+	/* Last value of the sweep; 0 for a single value. */
+	uint32_t last;
+	/* Step applied between scenarios; 0 for a single value. */
+	uint32_t incr;
+	/* How the step is applied. */
+	alg_op_type op;
+	/* Value used by the scenario currently running. */
+	uint32_t cur;
+};
+
+/*
+ * Worker table of a test case: entry i pairs an lcore with a DMA device.
+ * NOTE(review): the arrays hold MAX_WORKER_NB entries while MAX_LCORE_NB is
+ * larger; code filling this table must bound cnt by the array size.
+ */
+struct lcore_dma_map_t {
+	/* lcore IDs taken from the "lcore" or "lcore_dma" configuration entry. */
+	uint32_t lcores[MAX_WORKER_NB];
+	/* DMA device names taken from the "lcore_dma" configuration entry. */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
+	/* DMA device IDs, used to address the devices at run time. */
+	int16_t dma_ids[MAX_WORKER_NB];
+	/* Number of valid entries in the arrays above. */
+	uint16_t cnt;
+};
+
+/* Settings of one test case, filled from a "caseN" configuration section. */
+struct test_configure {
+	/* False when the section is missing or could not be parsed. */
+	bool is_valid;
+	/* One of the TEST_TYPE_* values defined in main.c. */
+	uint8_t test_type;
+	/* Name of the test type as written in the configuration file. */
+	const char *test_type_str;
+	/* NUMA nodes for the source and destination memory. */
+	uint16_t src_numa_node;
+	uint16_t dst_numa_node;
+	uint16_t opcode;
+	bool is_dma;
+	/* Workers (lcores and, for DMA tests, their devices) of this case. */
+	struct lcore_dma_map_t lcore_dma_map;
+	/* At most one of the four entries below may be a sweep. */
+	struct test_configure_entry mem_size;
+	struct test_configure_entry buf_size;
+	struct test_configure_entry ring_size;
+	struct test_configure_entry kick_batch;
+	/* "cache_flush" configuration entry: 1 to flush, 0 not to flush. */
+	uint32_t cache_flush;
+	uint32_t nr_buf;
+	/* "test_seconds" configuration entry. */
+	uint16_t test_secs;
+	/* Extra EAL arguments of this case, or NULL when none were given. */
+	const char *eal_args;
+	/* 1-based index of the scenario currently running within the case. */
+	uint8_t scenario_id;
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
diff --git a/doc/guides/rel_notes/release_23_07.rst b/doc/guides/rel_notes/release_23_07.rst
index 027ae7bd2d..2355b558bc 100644
--- a/doc/guides/rel_notes/release_23_07.rst
+++ b/doc/guides/rel_notes/release_23_07.rst
@@ -170,6 +170,12 @@ New Features

   See :doc:`../prog_guide/pdcp_lib` for more information.

+* **Added DMA device performance test application.**
+
+  Added a new application to test the performance of DMA device and CPU.
+
+  See the :doc:`../tools/dmaperf` for more details.
+

 Removed Items
 -------------
diff --git a/doc/guides/tools/dmaperf.rst b/doc/guides/tools/dmaperf.rst
new file mode 100644
index 0000000000..c5f8a9406f
--- /dev/null
+++ b/doc/guides/tools/dmaperf.rst
@@ -0,0 +1,103 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2023 Intel Corporation.
+
+dpdk-test-dma-perf Application
+==============================
+
+The ``dpdk-test-dma-perf`` tool is a Data Plane Development Kit (DPDK) application that enables
+testing the performance of DMA (Direct Memory Access) devices available within DPDK. It provides a
+test framework to assess the performance of CPU and DMA devices under various scenarios, such as
+varying buffer lengths. Doing so provides insight into the potential performance when using these
+DMA devices for acceleration in DPDK applications. It supports memory copy performance tests for
+now, comparing the performance of CPU and DMA automatically in various conditions with the help of a
+pre-set configuration file.
+
+
+Configuration
+-------------
+This application uses inherent DPDK EAL command-line options as well as custom command-line options
+in the application. An example configuration file for the application is provided and gives the
+meanings for each parameter.
+
+Here is an extracted sample from the configuration file (the complete sample can be found in the
+application source directory):
+
+.. code-block:: ini
+
+   [case1]
+   type=DMA_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   dma_ring_size=1024
+   kick_batch=32
+   src_numa_node=0
+   dst_numa_node=0
+   cache_flush=0
+   test_seconds=2
+   lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+   eal_args=--in-memory --file-prefix=test
+
+   [case2]
+   type=CPU_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   src_numa_node=0
+   dst_numa_node=1
+   cache_flush=0
+   test_seconds=2
+   lcore = 3, 4
+   eal_args=--in-memory --no-pci
+
+The configuration file is divided into multiple sections, each section represents a test case.
+The four variables mem_size, buf_size, dma_ring_size, and kick_batch can vary in each test case.
+The format for this is ``variable=first,last,increment,ADD\|MUL``. This means that the first value
+of the variable is 'first', the last value is 'last', 'increment' is the step size, and ADD|MUL
+indicates whether the change is by addition or multiplication. Each case can only have one variable
+change, and each change will generate a scenario, so each case can have multiple scenarios.
+
+Parameter Definitions
+---------------------
+
+- **type**: The type of the test. Currently supported types are `DMA_MEM_COPY` and `CPU_MEM_COPY`.
+- **mem_size**: The size of the memory footprint.
+- **buf_size**: The memory size of a single operation.
+- **dma_ring_size**: The DMA ring buffer size. Must be a power of two, and between 64 and 4096.
+- **kick_batch**: The DMA operation batch size, should be greater than 1 normally.
+- **src_numa_node**: Controls the NUMA node where the source memory is allocated.
+- **dst_numa_node**: Controls the NUMA node where the destination memory is allocated.
+- **cache_flush**: Determines whether the cache should be flushed. `1` indicates to flush and `0` to not flush.
+- **test_seconds**: Controls the test time for each scenario.
+- **lcore_dma**: Specifies the lcore/DMA mapping.
+- **lcore**: Specifies the lcore for CPU testing.
+- **eal_args**: Specifies the EAL arguments.
+
+.. Note::
+
+	The mapping of lcore to DMA must be one-to-one and cannot be duplicated.
+
+To specify a configuration file, use the "\-\-config" flag followed by the path to the file.
+
+To specify a result file, use the "\-\-result" flag followed by the path to the file. If you do not
+specify a result file, one will be generated with the same name as the configuration file, with the
+addition of "_result.csv" at the end.
+
+
+Running the Application
+-----------------------
+
+Typical command-line invocation to execute the application:
+
+.. code-block:: console
+
+   dpdk-test-dma-perf --config ./config_dma.ini --result ./res_dma.csv
+
+Where `config_dma.ini` is the configuration file, and `res_dma.csv` will be the generated result
+file.
+
+After the tests, you can find the results in the `res_dma.csv` file.
+
+Limitations
+-----------
+
+Currently, this tool only supports memory copy performance tests. Additional enhancements are
+possible in the future to support more types of tests for DMA devices and CPUs.
diff --git a/doc/guides/tools/index.rst b/doc/guides/tools/index.rst
index 6f84fc31ff..857572da96 100644
--- a/doc/guides/tools/index.rst
+++ b/doc/guides/tools/index.rst
@@ -23,3 +23,4 @@ DPDK Tools User Guides
     testregex
     testmldev
     dts
+    dmaperf
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
  2023-06-20  6:53 ` [PATCH v8] " Cheng Jiang
@ 2023-06-23  6:52   ` Anoob Joseph
  2023-06-24 11:52     ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-23  6:52 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia,
	Amit Prakash Shukla, huangdengdui, kevin.laatz, fengchengwen,
	Jerin Jacob Kollanukkaran
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he, weix.ling

Hi Cheng,

Thanks for the new version. Please see inline.

Thanks,
Anoob

> -----Original Message-----
> From: Cheng Jiang <cheng1.jiang@intel.com>
> Sent: Tuesday, June 20, 2023 12:24 PM
> To: thomas@monjalon.net; bruce.richardson@intel.com;
> mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>;
> huangdengdui@huawei.com; kevin.laatz@intel.com;
> fengchengwen@huawei.com; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>
> Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> weix.ling@intel.com; Cheng Jiang <cheng1.jiang@intel.com>
> Subject: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
> 
> External Email
> 
> ----------------------------------------------------------------------
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v8:
>   fixed string copy issue in parse_lcore();
>   improved some data display format;
>   added doc in doc/guides/tools;
>   updated release notes;
> 
> v7:
>   fixed some strcpy issues;
>   removed cache setup in calling rte_pktmbuf_pool_create();
>   fixed some typos;
>   added some memory free and null set operations;
>   improved result calculation;
> v6:
>   improved code based on Anoob's comments;
>   fixed some code structure issues;
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build                        |   1 +
>  app/test-dma-perf/benchmark.c          | 498 +++++++++++++++++++++
>  app/test-dma-perf/config.ini           |  61 +++
>  app/test-dma-perf/main.c               | 594 +++++++++++++++++++++++++
>  app/test-dma-perf/main.h               |  69 +++
>  app/test-dma-perf/meson.build          |  17 +
>  doc/guides/rel_notes/release_23_07.rst |   6 +
>  doc/guides/tools/dmaperf.rst           | 103 +++++
>  doc/guides/tools/index.rst             |   1 +
>  9 files changed, 1350 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build  create mode 100644
> doc/guides/tools/dmaperf.rst
> 

<snip>

> +/* Configuration of device. */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> +	uint16_t vchan = 0;
> +	struct rte_dma_info info;
> +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> +	struct rte_dma_vchan_conf qconf = {
> +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> +		.nb_desc = ring_size
> +	};
> +
> +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> +
> +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> +
> +	rte_dma_info_get(dev_id, &info);

[Anoob] This API can return errors. Better to add handling.

> +	if (info.nb_vchans != 1)
> +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> reported on device id. %u\n",
> +				dev_id);
> +
> +	if (rte_dma_start(dev_id) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> +
> +static int
> +config_dmadevs(struct test_configure *cfg) {
> +	uint32_t ring_size = cfg->ring_size.cur;
> +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	uint32_t nb_workers = ldm->cnt;
> +	uint32_t i;
> +	int dev_id;
> +	uint16_t nb_dmadevs = 0;
> +	char *dma_name;
> +
> +	for (i = 0; i < ldm->cnt; i++) {
> +		dma_name = ldm->dma_names[i];
> +		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> +		if (dev_id == -1) {

[Anoob] Can you check the above API definition? I think it returns not just -1 in case of errors.

> +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> dma_name);
> +			goto end;
> +		}
> +
> +		ldm->dma_ids[i] = dev_id;
> +		configure_dmadev_queue(dev_id, ring_size);
> +		++nb_dmadevs;
> +	}
> +
> +end:
> +	if (nb_dmadevs < nb_workers) {
> +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> nb_dmadevs, nb_workers);
> +		return -1;
> +	}
> +
> +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> +
> +	return 0;
> +}
> +
> +static inline void
> +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> +			volatile struct worker_info *worker_info) {
> +	int ret;
> +	uint16_t nr_cpl;
> +
> +	ret = rte_dma_submit(dev_id, 0);
> +	if (ret < 0) {
> +		rte_dma_stop(dev_id);
> +		rte_dma_close(dev_id);
> +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> +	}
> +
> +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> NULL);
> +	*async_cnt -= nr_cpl;
> +	worker_info->total_cpl += nr_cpl;
> +}
> +
> +static inline int
> +do_dma_mem_copy(void *p)

[Anoob] Just curious, why not pass struct lcore_params *para itself? Is it because the pointer is volatile? If yes, then we can take an AI to split the struct into volatile and non-volatile parts.

> +{
> +	const uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	const uint16_t dev_id = para->dev_id;
> +	const uint32_t nr_buf = para->nr_buf;
> +	const uint16_t kick_batch = para->kick_batch;
> +	const uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	uint16_t nr_cpl;
> +	uint64_t async_cnt = 0;
> +	uint32_t i;
> +	uint32_t poll_cnt = 0;
> +	int ret;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +dma_copy:
> +			ret = rte_dma_copy(dev_id, 0,
> rte_pktmbuf_iova(srcs[i]),
> +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);

[Anoob] Do we need to use ' rte_mbuf_data_iova' here instead of 'rte_pktmbuf_iova'? 

> +			if (unlikely(ret < 0)) {
> +				if (ret == -ENOSPC) {
> +					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> +					goto dma_copy;
> +				} else {
> +					/* Error exit */
> +					rte_dma_stop(dev_id);

[Anoob] Missing rte_dma_close() here. Also, may be introduce a static void function so that rte_exit call etc won't be part of the fastpath loop.

May be something like below and you can call it from here and "do_dma_submit_and_poll".

static void
error_exit(int dev_id)
{
	/* Error exit */
	rte_dma_stop(dev_id);
	rte_dma_close(dev_id);
	rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
}

> +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");
> +				}
> +			}
> +			async_cnt++;
> +
> +			if ((async_cnt % kick_batch) == 0)
> +				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +

<snip>

> +static int
> +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> +			struct rte_mbuf ***dsts)
> +{
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	unsigned int nr_sockets;
> +	uint32_t nr_buf = cfg->nr_buf;
> +
> +	nr_sockets = rte_socket_count();
> +	if (cfg->src_numa_node >= nr_sockets ||
> +		cfg->dst_numa_node >= nr_sockets) {
> +		printf("Error: Source or destination numa exceeds the acture
> numa nodes.\n");
> +		return -1;
> +	}
> +
> +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +			nr_buf,
> +			0,
> +			0,
> +			buf_size + RTE_PKTMBUF_HEADROOM,
> +			cfg->src_numa_node);
> +	if (src_pool == NULL) {
> +		PRINT_ERR("Error with source mempool creation.\n");
> +		return -1;
> +	}
> +
> +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +			nr_buf,
> +			0,
> +			0,
> +			buf_size + RTE_PKTMBUF_HEADROOM,
> +			cfg->dst_numa_node);
> +	if (dst_pool == NULL) {
> +		PRINT_ERR("Error with destination mempool creation.\n");
> +		return -1;
> +	}
> +
> +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	if (*srcs == NULL) {
> +		printf("Error: srcs malloc failed.\n");
> +		return -1;
> +	}
> +
> +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	if (*dsts == NULL) {
> +		printf("Error: dsts malloc failed.\n");
> +		return -1;
> +	}
> +
> +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {

[Anoob] Might be better to use 'rte_pktmbuf_alloc_bulk' since we use ' rte_mbuf_data_iova' in the datapath and it is desirable to initialize it properly as an mbuf. Same comment below as well.

<snip>

> +
> +	for (i = 0; i < nb_workers; i++) {
> +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> +			lcores_p[i].v_ptr->worker_info.test_cpl,
> +			&memory, &avg_cycles, &bandwidth, &mops);
> +		output_result(cfg->scenario_id, lcores_p[i].v_ptr->lcore_id,
> +					lcores_p[i].v_ptr->dma_name,
> avg_cycles, buf_size,
> +					nr_buf / nb_workers, memory,
> bandwidth, mops, is_dma);
> +	}

[Anoob] Can you also print total_bandwidth & total_mops? It can be a simple aggregation in the above loop. It would help when we are dealing with a larger number of queues but a single hardware block.

> +
> +out:
> +	/* free mbufs used in the test */
> +	if (srcs)

[Anoob] DPDK coding guidelines recommend a usage like below,
	if (srcs != NULL)

> +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> +	if (dsts)
> +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> +
> +	/* free the points for the mbufs */
> +	rte_free(srcs);
> +	srcs = NULL;
> +	rte_free(dsts);
> +	dsts = NULL;
> +
> +	if (src_pool) {
> +		rte_mempool_free(src_pool);
> +		src_pool = NULL;
> +	}
> +	if (dst_pool) {
> +		rte_mempool_free(dst_pool);
> +		src_pool = NULL;

[Anoob] Should be dst_pool, right?

<snip>

> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new file
> mode 100644 index 0000000000..215ac42673
> --- /dev/null
> +++ b/app/test-dma-perf/main.h
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Intel Corporation
> + */
> +
> +#ifndef _MAIN_H_
> +#define _MAIN_H_
> +
> +
> +#include <rte_common.h>
> +#include <rte_cycles.h>
> +#include <rte_dev.h>
> +#include <rte_dmadev.h>

[Anoob] Is the above include (rte_dmadev.h) required?

> +
> +#ifndef __maybe_unused
> +#define __maybe_unused	__rte_unused
> +#endif

[Anoob] Can you try to avoid this and use __rte_unused or RTE_SET_USED in the cache_flush_buf() function?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
  2023-06-23  6:52   ` [EXT] " Anoob Joseph
@ 2023-06-24 11:52     ` Jiang, Cheng1
  2023-06-26  5:41       ` Anoob Joseph
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-24 11:52 UTC (permalink / raw)
  To: Anoob Joseph, thomas, Richardson, Bruce, mb, Xia, Chenbo,
	Amit Prakash Shukla, huangdengdui, Laatz, Kevin, fengchengwen,
	Jerin Jacob Kollanukkaran
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Ling, WeiX

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Friday, June 23, 2023 2:53 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> Prakash Shukla <amitprakashs@marvell.com>; huangdengdui@huawei.com;
> Laatz, Kevin <kevin.laatz@intel.com>; fengchengwen@huawei.com; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Ling, WeiX <weix.ling@intel.com>
> Subject: RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Thanks for the new version. Please see inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Cheng Jiang <cheng1.jiang@intel.com>
> > Sent: Tuesday, June 20, 2023 12:24 PM
> > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash Shukla
> > <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>;
> > huangdengdui@huawei.com; kevin.laatz@intel.com;
> > fengchengwen@huawei.com; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>
> > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> > weix.ling@intel.com; Cheng Jiang <cheng1.jiang@intel.com>
> > Subject: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre- set config file. Memory Copy performance test
> are supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v8:
> >   fixed string copy issue in parse_lcore();
> >   improved some data display format;
> >   added doc in doc/guides/tools;
> >   updated release notes;
> >
> > v7:
> >   fixed some strcpy issues;
> >   removed cache setup in calling rte_pktmbuf_pool_create();
> >   fixed some typos;
> >   added some memory free and null set operations;
> >   improved result calculation;
> > v6:
> >   improved code based on Anoob's comments;
> >   fixed some code structure issues;
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build                        |   1 +
> >  app/test-dma-perf/benchmark.c          | 498 +++++++++++++++++++++
> >  app/test-dma-perf/config.ini           |  61 +++
> >  app/test-dma-perf/main.c               | 594 +++++++++++++++++++++++++
> >  app/test-dma-perf/main.h               |  69 +++
> >  app/test-dma-perf/meson.build          |  17 +
> >  doc/guides/rel_notes/release_23_07.rst |   6 +
> >  doc/guides/tools/dmaperf.rst           | 103 +++++
> >  doc/guides/tools/index.rst             |   1 +
> >  9 files changed, 1350 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> > 100644 app/test-dma-perf/meson.build  create mode 100644
> > doc/guides/tools/dmaperf.rst
> >
> 
> <snip>
> 
> > +/* Configuration of device. */
> > +static void
> > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > +	uint16_t vchan = 0;
> > +	struct rte_dma_info info;
> > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > +	struct rte_dma_vchan_conf qconf = {
> > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > +		.nb_desc = ring_size
> > +	};
> > +
> > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> > +
> > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> > +
> > +	rte_dma_info_get(dev_id, &info);
> 
> [Anoob] This API can return errors. Better to add handling.

[Cheng] Sure, I'll fix it in the next version.

> 
> > +	if (info.nb_vchans != 1)
> > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > reported on device id. %u\n",
> > +				dev_id);
> > +
> > +	if (rte_dma_start(dev_id) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > +
> > +static int
> > +config_dmadevs(struct test_configure *cfg) {
> > +	uint32_t ring_size = cfg->ring_size.cur;
> > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	uint32_t nb_workers = ldm->cnt;
> > +	uint32_t i;
> > +	int dev_id;
> > +	uint16_t nb_dmadevs = 0;
> > +	char *dma_name;
> > +
> > +	for (i = 0; i < ldm->cnt; i++) {
> > +		dma_name = ldm->dma_names[i];
> > +		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> > +		if (dev_id == -1) {
> 
> [Anoob] Can you check the above API definition? I think it returns not just -1
> in case of errors.

[Cheng] Yes, you are right, I'll fix it in the next version. Thanks a lot.

> 
> > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > dma_name);
> > +			goto end;
> > +		}
> > +
> > +		ldm->dma_ids[i] = dev_id;
> > +		configure_dmadev_queue(dev_id, ring_size);
> > +		++nb_dmadevs;
> > +	}
> > +
> > +end:
> > +	if (nb_dmadevs < nb_workers) {
> > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> > nb_dmadevs, nb_workers);
> > +		return -1;
> > +	}
> > +
> > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > +
> > +	return 0;
> > +}
> > +
> > +static inline void
> > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > +			volatile struct worker_info *worker_info) {
> > +	int ret;
> > +	uint16_t nr_cpl;
> > +
> > +	ret = rte_dma_submit(dev_id, 0);
> > +	if (ret < 0) {
> > +		rte_dma_stop(dev_id);
> > +		rte_dma_close(dev_id);
> > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > +	}
> > +
> > +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> > NULL);
> > +	*async_cnt -= nr_cpl;
> > +	worker_info->total_cpl += nr_cpl;
> > +}
> > +
> > +static inline int
> > +do_dma_mem_copy(void *p)
> 
> [Anoob] Just curious, why not pass struct lcore_params *para itself? Is it
> because the pointer is volatile? If yes, then we can take an AI to split the
> struct into volatile and non-volatile parts.

[Cheng] The reason I did it this way is because I want to launch this function on another core by spawning a new thread, and rte_eal_remote_launch() takes a void * as the parameter. That's why I passed void *p.  Your suggestion to split the struct into volatile and non-volatile parts is quite reasonable. I am thinking about the best way to implement it. Thanks.

> 
> > +{
> > +	const uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	const uint16_t dev_id = para->dev_id;
> > +	const uint32_t nr_buf = para->nr_buf;
> > +	const uint16_t kick_batch = para->kick_batch;
> > +	const uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	uint16_t nr_cpl;
> > +	uint64_t async_cnt = 0;
> > +	uint32_t i;
> > +	uint32_t poll_cnt = 0;
> > +	int ret;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +dma_copy:
> > +			ret = rte_dma_copy(dev_id, 0,
> > rte_pktmbuf_iova(srcs[i]),
> > +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> 
> [Anoob] Do we need to use ' rte_mbuf_data_iova' here instead of
> 'rte_pktmbuf_iova'?

[Cheng] yes rte_mbuf_data_iova is more appropriate, I'll fix it in the next version. Thanks.

> 
> > +			if (unlikely(ret < 0)) {
> > +				if (ret == -ENOSPC) {
> > +					do_dma_submit_and_poll(dev_id,
> > &async_cnt, worker_info);
> > +					goto dma_copy;
> > +				} else {
> > +					/* Error exit */
> > +					rte_dma_stop(dev_id);
> 
> [Anoob] Missing rte_dma_close() here. Also, may be introduce a static void
> function so that rte_exit call etc won't be part of the fastpath loop.
> 
> May be something like below and you can call it from here and
> "do_dma_submit_and_poll".
> 
> static void
> error_exit(int dev_id)
> {
> 	/* Error exit */
> 	rte_dma_stop(dev_id);
> 	rte_dma_close(dev_id);
> 	rte_exit(EXIT_FAILURE, "DMA enqueue failed\n"); }
> 

[Cheng] I'm not so sure here. rte_dma_close() is already called inside rte_exit(). Do we still need to call it explicitly before rte_exit()?

> > +					rte_exit(EXIT_FAILURE, "DMA
> > enqueue failed\n");
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			if ((async_cnt % kick_batch) == 0)
> > +				do_dma_submit_and_poll(dev_id,
> > &async_cnt, worker_info);
> > +		}
> > +
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	rte_dma_submit(dev_id, 0);
> > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +		async_cnt -= nr_cpl;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> 
> <snip>
> 
> > +static int
> > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> > +			struct rte_mbuf ***dsts)
> > +{
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	unsigned int nr_sockets;
> > +	uint32_t nr_buf = cfg->nr_buf;
> > +
> > +	nr_sockets = rte_socket_count();
> > +	if (cfg->src_numa_node >= nr_sockets ||
> > +		cfg->dst_numa_node >= nr_sockets) {
> > +		printf("Error: Source or destination numa exceeds the acture
> > numa nodes.\n");
> > +		return -1;
> > +	}
> > +
> > +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > +			nr_buf,
> > +			0,
> > +			0,
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->src_numa_node);
> > +	if (src_pool == NULL) {
> > +		PRINT_ERR("Error with source mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > +			nr_buf,
> > +			0,
> > +			0,
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->dst_numa_node);
> > +	if (dst_pool == NULL) {
> > +		PRINT_ERR("Error with destination mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > +	if (*srcs == NULL) {
> > +		printf("Error: srcs malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> > +	if (*dsts == NULL) {
> > +		printf("Error: dsts malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
> 
> [Anoob] Might be better to use 'rte_pktmbuf_alloc_bulk' since we use '
> rte_mbuf_data_iova' in the datapath and it is desirable to initialize it properly
> as an mbuf. Same comment below as well.
> 

[Cheng] sure, I'll fix it in the next version.

> <snip>
> 
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > +			lcores_p[i].v_ptr->worker_info.test_cpl,
> > +			&memory, &avg_cycles, &bandwidth, &mops);
> > +		output_result(cfg->scenario_id, lcores_p[i].v_ptr->lcore_id,
> > +					lcores_p[i].v_ptr->dma_name,
> > avg_cycles, buf_size,
> > +					nr_buf / nb_workers, memory,
> > bandwidth, mops, is_dma);
> > +	}
> 
> [Anoob] Can you also print total_bandwidth & total_mops? It can be a simple
> aggregation in the above loop. Would help when we are dealing with larger
> number of queues but single hardware block.

[Cheng] sure, good point. I'll add it in the next version, thanks.

> 
> > +
> > +out:
> > +	/* free mbufs used in the test */
> > +	if (srcs)
> 
> [Anoob] DPDK coding guidelines recommend a usage like below,
> 	(if (srcs != NULL)
> 

[Cheng] sure, thanks. I'll fix it in the next version.

> > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > +	if (dsts)
> > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > +
> > +	/* free the points for the mbufs */
> > +	rte_free(srcs);
> > +	srcs = NULL;
> > +	rte_free(dsts);
> > +	dsts = NULL;
> > +
> > +	if (src_pool) {
> > +		rte_mempool_free(src_pool);
> > +		src_pool = NULL;
> > +	}
> > +	if (dst_pool) {
> > +		rte_mempool_free(dst_pool);
> > +		src_pool = NULL;
> 
> [Anoob] Should be dst_pool, right?

[Cheng] yes, sorry for the miss. I'll fix it in the next version.

> 
> <snip>
> 
> > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new
> > file mode 100644 index 0000000000..215ac42673
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.h
> > @@ -0,0 +1,69 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2023 Intel Corporation  */
> > +
> > +#ifndef _MAIN_H_
> > +#define _MAIN_H_
> > +
> > +
> > +#include <rte_common.h>
> > +#include <rte_cycles.h>
> > +#include <rte_dev.h>
> > +#include <rte_dmadev.h>
> 
> [Anoob] Is the above include (rte_dmadev.h) required?

[Cheng] you are right. It's not required. I'll remove it in the next version.

> 
> > +
> > +#ifndef __maybe_unused
> > +#define __maybe_unused	__rte_unused
> > +#endif
> 
> [Anoob] Can you try to avoid this and use rte_unused or RTE_SET_USED in
> the cache_flush_buf() function?

[Cheng] sure, I'll fix it in the next version.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
  2023-06-24 11:52     ` Jiang, Cheng1
@ 2023-06-26  5:41       ` Anoob Joseph
  2023-06-26 10:02         ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Anoob Joseph @ 2023-06-26  5:41 UTC (permalink / raw)
  To: Jiang, Cheng1, thomas, Richardson, Bruce, mb, Xia, Chenbo,
	Amit Prakash Shukla, huangdengdui, Laatz, Kevin, fengchengwen,
	Jerin Jacob Kollanukkaran
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Ling, WeiX

Hi Cheng,

Please see inline.

Thanks,
Anoob

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Saturday, June 24, 2023 5:23 PM
> To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> Prakash Shukla <amitprakashs@marvell.com>; huangdengdui@huawei.com;
> Laatz, Kevin <kevin.laatz@intel.com>; fengchengwen@huawei.com; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Ling, WeiX <weix.ling@intel.com>
> Subject: RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
> 
> Hi Anoob,
> 
> Replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Anoob Joseph <anoobj@marvell.com>
> > Sent: Friday, June 23, 2023 2:53 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> > Prakash Shukla <amitprakashs@marvell.com>;
> huangdengdui@huawei.com;
> > Laatz, Kevin <kevin.laatz@intel.com>; fengchengwen@huawei.com; Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Ling,
> > WeiX <weix.ling@intel.com>
> > Subject: RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Cheng,
> >
> > Thanks for the new version. Please see inline.
> >
> > Thanks,
> > Anoob
> >
> > > -----Original Message-----
> > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > Sent: Tuesday, June 20, 2023 12:24 PM
> > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash
> Shukla
> > > <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>;
> > > huangdengdui@huawei.com; kevin.laatz@intel.com;
> > > fengchengwen@huawei.com; Jerin Jacob Kollanukkaran
> > > <jerinj@marvell.com>
> > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> xingguang.he@intel.com;
> > > weix.ling@intel.com; Cheng Jiang <cheng1.jiang@intel.com>
> > > Subject: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > External Email
> > >
> > > --------------------------------------------------------------------
> > > -- There are many high-performance DMA devices supported in DPDK
> > > now,
> > and
> > > these DMA devices can also be integrated into other modules of DPDK
> > > as accelerators, such as Vhost. Before integrating DMA into
> > > applications, developers need to know the performance of these DMA
> > > devices in various scenarios and the performance of CPUs in the same
> > > scenario, such as different buffer lengths. Only in this way can we
> > > know the target performance of the application accelerated by using
> > > them. This patch introduces a high-performance testing tool, which
> > > supports comparing the performance of CPU and DMA in different
> > > scenarios automatically with a pre- set config file. Memory Copy
> > > performance test
> > are supported for now.
> > >
> > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > ---
> > > v8:
> > >   fixed string copy issue in parse_lcore();
> > >   improved some data display format;
> > >   added doc in doc/guides/tools;
> > >   updated release notes;
> > >
> > > v7:
> > >   fixed some strcpy issues;
> > >   removed cache setup in calling rte_pktmbuf_pool_create();
> > >   fixed some typos;
> > >   added some memory free and null set operations;
> > >   improved result calculation;
> > > v6:
> > >   improved code based on Anoob's comments;
> > >   fixed some code structure issues;
> > > v5:
> > >   fixed some LONG_LINE warnings;
> > > v4:
> > >   fixed inaccuracy of the memory footprint display;
> > > v3:
> > >   fixed some typos;
> > > v2:
> > >   added lcore/dmadev designation;
> > >   added error case process;
> > >   removed worker_threads parameter from config.ini;
> > >   improved the logs;
> > >   improved config file;
> > >
> > >  app/meson.build                        |   1 +
> > >  app/test-dma-perf/benchmark.c          | 498 +++++++++++++++++++++
> > >  app/test-dma-perf/config.ini           |  61 +++
> > >  app/test-dma-perf/main.c               | 594 +++++++++++++++++++++++++
> > >  app/test-dma-perf/main.h               |  69 +++
> > >  app/test-dma-perf/meson.build          |  17 +
> > >  doc/guides/rel_notes/release_23_07.rst |   6 +
> > >  doc/guides/tools/dmaperf.rst           | 103 +++++
> > >  doc/guides/tools/index.rst             |   1 +
> > >  9 files changed, 1350 insertions(+)  create mode 100644
> > > app/test-dma-perf/benchmark.c  create mode 100644
> > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > mode
> > > 100644 app/test-dma-perf/meson.build  create mode 100644
> > > doc/guides/tools/dmaperf.rst
> > >
> >

<snip>
 
> >
> > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > dma_name);
> > > +			goto end;
> > > +		}
> > > +
> > > +		ldm->dma_ids[i] = dev_id;
> > > +		configure_dmadev_queue(dev_id, ring_size);
> > > +		++nb_dmadevs;
> > > +	}
> > > +
> > > +end:
> > > +	if (nb_dmadevs < nb_workers) {
> > > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> > > nb_dmadevs, nb_workers);
> > > +		return -1;
> > > +	}
> > > +
> > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static inline void
> > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > > +			volatile struct worker_info *worker_info) {
> > > +	int ret;
> > > +	uint16_t nr_cpl;
> > > +
> > > +	ret = rte_dma_submit(dev_id, 0);
> > > +	if (ret < 0) {
> > > +		rte_dma_stop(dev_id);
> > > +		rte_dma_close(dev_id);
> > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > +	}
> > > +
> > > +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> > > NULL);
> > > +	*async_cnt -= nr_cpl;
> > > +	worker_info->total_cpl += nr_cpl;
> > > +}
> > > +
> > > +static inline int
> > > +do_dma_mem_copy(void *p)
> >
> > [Anoob] Just curious, why not pass struct lcore_params *para itself?
> > Is it because the pointer is volatile? If yes, then we can take an AI
> > to split the struct into volatile and non-volatile parts.
> 
> [Cheng] The reason I did it this way is because I want to launch this function
> on another core by spawning a new thread, and rte_eal_remote_launch()
> takes a void * as the parameter. That's why I passed void *p.  Your
> suggestion to split the struct into volatile and non-volatile parts is quite
> reasonable. I am thinking about the best way to implement it. Thanks.

[Anoob] Instead of passing the address of the index variable as void *, you can easily pass the lcore_params pointer, right?

> 
> >
> > > +{
> > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > +	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
> > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > +	const uint16_t dev_id = para->dev_id;
> > > +	const uint32_t nr_buf = para->nr_buf;
> > > +	const uint16_t kick_batch = para->kick_batch;
> > > +	const uint32_t buf_size = para->buf_size;
> > > +	struct rte_mbuf **srcs = para->srcs;
> > > +	struct rte_mbuf **dsts = para->dsts;
> > > +	uint16_t nr_cpl;
> > > +	uint64_t async_cnt = 0;
> > > +	uint32_t i;
> > > +	uint32_t poll_cnt = 0;
> > > +	int ret;
> > > +
> > > +	worker_info->stop_flag = false;
> > > +	worker_info->ready_flag = true;
> > > +
> > > +	while (!worker_info->start_flag)
> > > +		;
> > > +
> > > +	while (1) {
> > > +		for (i = 0; i < nr_buf; i++) {
> > > +dma_copy:
> > > +			ret = rte_dma_copy(dev_id, 0,
> > > rte_pktmbuf_iova(srcs[i]),
> > > +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> >
> > [Anoob] Do we need to use ' rte_mbuf_data_iova' here instead of
> > 'rte_pktmbuf_iova'?
> 
> [Cheng] yes rte_mbuf_data_iova is more appropriate, I'll fix it in the next
> version. Thanks.
> 
> >
> > > +			if (unlikely(ret < 0)) {
> > > +				if (ret == -ENOSPC) {
> > > +					do_dma_submit_and_poll(dev_id,
> > > &async_cnt, worker_info);
> > > +					goto dma_copy;
> > > +				} else {
> > > +					/* Error exit */
> > > +					rte_dma_stop(dev_id);
> >
> > [Anoob] Missing rte_dma_close() here. Also, may be introduce a static
> > void function so that rte_exit call etc won't be part of the fastpath loop.
> >
> > May be something like below and you can call it from here and
> > "do_dma_submit_and_poll".
> >
> > static void
> > error_exit(int dev_id)
> > {
> > 	/* Error exit */
> > 	rte_dma_stop(dev_id);
> > 	rte_dma_close(dev_id);
> > 	rte_exit(EXIT_FAILURE, "DMA enqueue failed\n"); }
> >
> 
> [Cheng] I'm not so sure here. rte_dma_close() is called in the rte_exit(). Do
> we still call it explicitly before rte_exit()?

[Anoob] In 'do_dma_submit_and_poll', there is an rte_dma_close() before rte_exit(). I'm fine either way as long as it is consistent. That said, I think it is better to call close() from the app, rather than relying on rte_exit().

<snip>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v9] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (6 preceding siblings ...)
  2023-06-20  6:53 ` [PATCH v8] " Cheng Jiang
@ 2023-06-26  9:41 ` Cheng Jiang
  2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
  2023-06-29 13:14 ` [PATCH v11] " Cheng Jiang
  9 siblings, 0 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-06-26  9:41 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, fengchengwen, jerinj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	weix.ling, Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v9:
  improved error handling;
  improved lcore_params structure;
  improved mbuf api calling;
  improved exit process;
  fixed some typos;
  added scenario summary data display;
  removed unnecessary include;
v8:
  fixed string copy issue in parse_lcore();
  improved some data display format;
  added doc in doc/guides/tools;
  updated release notes;
v7:
  fixed some strcpy issues;
  removed cache setup in calling rte_pktmbuf_pool_create();
  fixed some typos;
  added some memory free and null set operations;
  improved result calculation;
v6:
  improved code based on Anoob's comments;
  fixed some code structure issues;
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build                        |   1 +
 app/test-dma-perf/benchmark.c          | 508 ++++++++++++++++++++
 app/test-dma-perf/config.ini           |  61 +++
 app/test-dma-perf/main.c               | 616 +++++++++++++++++++++++++
 app/test-dma-perf/main.h               |  64 +++
 app/test-dma-perf/meson.build          |  17 +
 doc/guides/rel_notes/release_23_07.rst |   6 +
 doc/guides/tools/dmaperf.rst           | 103 +++++
 doc/guides/tools/index.rst             |   1 +
 9 files changed, 1377 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build
 create mode 100644 doc/guides/tools/dmaperf.rst

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..0601e0d171
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,508 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+#define POLL_MAX 1000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+#define CSV_TOTAL_LINE_FMT "Scenario %u Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"
+
+/*
+ * Handshake flags and completion counters of one worker. The structure is
+ * written by both the main lcore and the worker lcore.
+ */
+struct worker_info {
+	bool ready_flag;	/* set by the worker when it enters its copy routine */
+	bool start_flag;	/* set by the main lcore to release the worker */
+	bool stop_flag;		/* set by the main lcore to end the worker's copy loop */
+	uint32_t total_cpl;	/* running number of completed copies, bumped by the worker */
+	uint32_t test_cpl;	/* completions attributed to the timed window by the main lcore */
+};
+
+/* Per-worker arguments handed to do_dma_mem_copy()/do_cpu_mem_copy(). */
+struct lcore_params {
+	uint8_t scenario_id;	/* scenario number copied from the test configuration */
+	unsigned int lcore_id;	/* lcore the worker is launched on */
+	char *dma_name;		/* DMA device name; only assigned for DMA tests */
+	uint16_t worker_id;	/* index of the worker in lcores[] */
+	uint16_t dev_id;	/* dmadev id; only assigned for DMA tests */
+	uint32_t nr_buf;	/* number of buffers owned by this worker */
+	uint16_t kick_batch;	/* copies enqueued between submits; only assigned for DMA tests */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint16_t test_secs;	/* configured test duration in seconds */
+	struct rte_mbuf **srcs;	/* first source mbuf of this worker's slice */
+	struct rte_mbuf **dsts;	/* first destination mbuf of this worker's slice */
+	volatile struct worker_info worker_info;	/* flags/counters shared with the main lcore */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static struct lcore_params *lcores[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Print "In <func>:<lineno> - " followed by the printf-style message to
+ * stderr. Returns the sum of the values returned by the two print calls.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int prefix_len;
+	int msg_len;
+
+	prefix_len = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	msg_len = vfprintf(stderr, format, args);
+	va_end(args);
+
+	return prefix_len + msg_len;
+}
+
+/*
+ * Derive the per-worker figures reported for one scenario.
+ *
+ * buf_size:   bytes copied per operation
+ * nr_buf:     total number of buffers of the scenario (all workers)
+ * nb_workers: number of workers sharing the buffers
+ * test_secs:  length of the measurement window in seconds
+ * total_cnt:  copies completed by the worker during the window
+ * memory:     out, footprint of one worker (source + destination) in MB
+ * ave_cycle:  out, timer cycles per copy; 0 when nothing completed
+ * bandwidth:  out, Gbps
+ * mops:       out, million copies per second
+ *
+ * All outputs are set to 0 when the corresponding divisor is 0, so a worker
+ * that completed nothing no longer triggers a division by zero.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	float ops;
+
+	*memory = 0;
+	*ave_cycle = 0;
+	*bandwidth = 0;
+	*mops = 0;
+
+	/* 64-bit product: buf_size * nr_buf * 2 exceeds 32 bits from 4 GB on. */
+	if (nb_workers != 0)
+		*memory = (float)((uint64_t)buf_size * (nr_buf / nb_workers) * 2) /
+				(1024 * 1024);
+
+	if (total_cnt != 0)
+		*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
+
+	if (test_secs == 0)
+		return;
+
+	ops = (float)total_cnt / test_secs;
+	*mops = ops / (1000 * 1000);
+	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
+}
+
+/*
+ * Print the result of one worker to stdout and store the matching CSV line
+ * in output_str[lcore_id].
+ *
+ * The CSV table has MAX_WORKER_NB per-lcore rows followed by one summary
+ * row, so an lcore id of MAX_WORKER_NB or more has no row of its own. Such a
+ * result is still printed but is not stored, instead of writing outside the
+ * table or onto the summary row.
+ *
+ * dma_name, ring_size and kick_batch are only used when is_dma is true.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
+			float memory, float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
+				lcore_id, dma_name, ring_size, kick_batch);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
+	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (lcore_id >= MAX_WORKER_NB) {
+		fprintf(stderr, "lcore %u exceeds the CSV table (max lcore id %d), result not recorded.\n",
+				lcore_id, MAX_WORKER_NB - 1);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of every mbuf in the array from the CPU
+ * cache, 64 bytes at a time. Compiled to an empty function unless
+ * RTE_ARCH_X86_64 is defined.
+ */
+static inline void
+cache_flush_buf(__rte_unused struct rte_mbuf **array,
+		__rte_unused uint32_t buf_size,
+		__rte_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t buf_idx;
+
+	for (buf_idx = 0; buf_idx < nr_buf; buf_idx++) {
+		char *base = rte_pktmbuf_mtod(array[buf_idx], char *);
+		uint32_t pos = 0;
+
+		while (pos < buf_size) {
+			__builtin_ia32_clflush(base + pos);
+			pos += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a dmadev with a single mem-to-mem virtual channel of ring_size
+ * descriptors and start it. Any failing step terminates the application
+ * through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	const uint16_t vchan_id = 0;
+	struct rte_dma_conf dma_conf = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf vchan_conf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+	struct rte_dma_info dev_info;
+	int rc;
+
+	rc = rte_dma_configure(dev_id, &dma_conf);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	rc = rte_dma_vchan_setup(dev_id, vchan_id, &vchan_conf);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	rc = rte_dma_info_get(dev_id, &dev_info);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	/* the device must report exactly the one channel configured above */
+	if (dev_info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	rc = rte_dma_start(dev_id);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve every DMA device name of the lcore/DMA map to a device id, store
+ * the id in the map and configure the device. The walk stops at the first
+ * name that cannot be resolved.
+ *
+ * Returns 0 when one device was set up per worker, -1 otherwise.
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	struct lcore_dma_map_t *map = &cfg->lcore_dma_map;
+	const uint32_t ring_size = cfg->ring_size.cur;
+	const uint32_t nb_workers = map->cnt;
+	uint16_t nb_dmadevs = 0;
+	uint32_t idx = 0;
+
+	while (idx < map->cnt) {
+		char *name = map->dma_names[idx];
+		int id = rte_dma_get_dev_id_by_name(name);
+
+		if (id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", name);
+			break;
+		}
+
+		map->dma_ids[idx] = id;
+		configure_dmadev_queue(id, ring_size);
+		nb_dmadevs++;
+		idx++;
+	}
+
+	if (nb_dmadevs >= nb_workers) {
+		printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+		return 0;
+	}
+
+	printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+	return -1;
+}
+
+/* Stop and close the given dmadev, then terminate the application with EXIT_FAILURE. */
+static void
+error_exit(int dev_id)
+{
+	rte_dma_stop(dev_id);
+	rte_dma_close(dev_id);
+	rte_exit(EXIT_FAILURE, "DMA error\n");
+}
+
+/*
+ * Submit the copies enqueued on vchan 0 of dev_id, then poll once for up to
+ * MAX_DMA_CPL_NB completions. The number of completions is subtracted from
+ * *async_cnt and added to worker_info->total_cpl. A failing submit ends the
+ * application through error_exit().
+ */
+static inline void
+do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
+			volatile struct worker_info *worker_info)
+{
+	uint16_t done;
+
+	if (rte_dma_submit(dev_id, 0) < 0)
+		error_exit(dev_id);
+
+	done = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+	worker_info->total_cpl += done;
+	*async_cnt -= done;
+}
+
+/*
+ * Worker routine of the DMA copy test; p points to the worker's
+ * struct lcore_params.
+ *
+ * The worker raises ready_flag, spins until start_flag is set, then keeps
+ * enqueuing copies of its srcs[i] -> dsts[i] pairs until stop_flag is seen.
+ * Always returns 0; DMA errors end the application through error_exit().
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait until the main lcore releases all workers */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
+				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					/* ring full: submit and reap, then retry the same buffer */
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+
+			/* kick the device whenever the in-flight count is a multiple of kick_batch */
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		/* stop_flag is only checked after a full pass over the buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/*
+	 * Drain what is still in flight, giving up after POLL_MAX polls.
+	 * These completions are not added to total_cpl.
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker routine of the CPU copy test; p points to the worker's
+ * struct lcore_params.
+ *
+ * The worker raises ready_flag, spins until start_flag is set, then copies
+ * its srcs[i] -> dsts[i] pairs with rte_memcpy() until stop_flag is seen,
+ * counting every copy in total_cpl. Always returns 0.
+ *
+ * The buffers are addressed through rte_pktmbuf_mtod(), i.e. by their
+ * virtual address. The IOVA of an mbuf is only usable as a CPU pointer when
+ * the EAL runs in IOVA-as-VA mode; in IOVA-as-PA mode it is a physical
+ * address and dereferencing it faults.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint32_t nr_buf = para->nr_buf;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* busy-wait until the main lcore releases all workers */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			const void *src = rte_pktmbuf_mtod(srcs[i], void *);
+			void *dst = rte_pktmbuf_mtod(dsts[i], void *);
+
+			/* copy buffer from src to dst */
+			rte_memcpy(dst, src, (size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		/* stop_flag is only checked after a full pass over the buffers */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA nodes
+ * and allocate cfg->nr_buf mbufs from each of them.
+ *
+ * srcs/dsts: out, arrays of cfg->nr_buf mbuf pointers.
+ *
+ * Returns 0 on success, -1 on failure. Nothing is released here on failure:
+ * the pools are kept in src_pool/dst_pool and the arrays in *srcs and *dsts
+ * so the caller can free them. The arrays are zero-filled on allocation, so
+ * after a failed bulk allocation they hold NULL entries rather than
+ * uninitialised pointers, and rte_pktmbuf_free_bulk() skips them.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src mbufs failed.\n");
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one scenario of the memory copy benchmark.
+ *
+ * cfg:    test case; the current values (.cur) of mem_size, buf_size,
+ *         ring_size and kick_batch describe the scenario. cfg->nr_buf is
+ *         written with the number of buffers derived from them.
+ * is_dma: true to copy with the mapped DMA devices, false to copy with the
+ *         worker lcores themselves.
+ *
+ * One worker is launched per entry of the lcore/DMA map. After a warm-up of
+ * TEST_WAIT_U_SECOND microseconds the completion counters are sampled, the
+ * test runs for cfg->test_secs seconds and the counters are sampled again;
+ * the difference is reported per worker and as a summary line.
+ *
+ * Scenarios that cannot run (no workers, more workers than MAX_WORKER_NB, a
+ * zero buffer size, a zero kick batch for DMA, a buffer count of 0 or above
+ * 32 bits) are rejected with a message before anything is allocated.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint64_t nr_buf_wide;
+	uint32_t nr_buf;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory = 0;
+	uint32_t avg_cycles = 0;
+	uint32_t avg_cycles_total;
+	float mops, mops_total;
+	float bandwidth, bandwidth_total;
+
+	if (nb_workers == 0 || nb_workers > MAX_WORKER_NB) {
+		printf("Error: Invalid number of workers (%u), expected 1 to %d.\n",
+				nb_workers, MAX_WORKER_NB);
+		return;
+	}
+
+	if (buf_size == 0) {
+		printf("Error: Buffer size must not be 0.\n");
+		return;
+	}
+
+	/* the DMA worker computes async_cnt % kick_batch */
+	if (is_dma && kick_batch == 0) {
+		printf("Error: Kick batch size must not be 0.\n");
+		return;
+	}
+
+	/* 64-bit arithmetic: mem_size (MB) * 1024 * 1024 wraps 32 bits at 4 GB */
+	nr_buf_wide = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (nr_buf_wide == 0 || nr_buf_wide > UINT32_MAX) {
+		printf("Error: Invalid number of buffers (%" PRIu64 ").\n", nr_buf_wide);
+		return;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)nr_buf_wide;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush == 1) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	/*
+	 * Allocate all worker parameters before launching anything, so that
+	 * an allocation failure cannot leave already-started workers spinning.
+	 * The memory is zeroed: the handshake flags and completion counters
+	 * must start from a known state.
+	 */
+	for (i = 0; i < nb_workers; i++) {
+		lcores[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (lcores[i] == NULL) {
+			printf("lcore parameters malloc failure for lcore %u\n", ldm->lcores[i]);
+			goto out;
+		}
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		/* each worker gets its own contiguous slice of the buffers */
+		offset = nr_buf / nb_workers * i;
+		if (is_dma) {
+			lcores[i]->dma_name = ldm->dma_names[i];
+			lcores[i]->dev_id = ldm->dma_ids[i];
+			lcores[i]->kick_batch = kick_batch;
+		}
+		lcores[i]->worker_id = i;
+		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		lcores[i]->buf_size = buf_size;
+		lcores[i]->test_secs = test_secs;
+		lcores[i]->srcs = srcs + offset;
+		lcores[i]->dsts = dsts + offset;
+		lcores[i]->scenario_id = cfg->scenario_id;
+		lcores[i]->lcore_id = lcore_id;
+
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+	}
+
+	/* wait until every worker has reported ready */
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (lcores[i]->worker_info.ready_flag == false) {
+				ready = false;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.start_flag = true;
+
+	/* warm-up, then remember the counters at the start of the window */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl;
+
+	/* measurement window: test_cpl becomes the completions inside it */
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl -
+						lcores[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	mops_total = 0;
+	bandwidth_total = 0;
+	avg_cycles_total = 0;
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			lcores[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, lcores[i]->lcore_id,
+					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
+					avg_cycles, buf_size, nr_buf / nb_workers, memory,
+					bandwidth, mops, is_dma);
+		mops_total += mops;
+		bandwidth_total += bandwidth;
+		avg_cycles_total += avg_cycles;
+	}
+	printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n", bandwidth_total, mops_total);
+	snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN, CSV_TOTAL_LINE_FMT,
+			cfg->scenario_id, nr_buf, memory * nb_workers,
+			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
+
+out:
+	/* free mbufs used in the test */
+	if (srcs != NULL)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts != NULL)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	/* free the pointer arrays of the mbufs */
+	rte_free(srcs);
+	srcs = NULL;
+	rte_free(dsts);
+	dsts = NULL;
+
+	rte_mempool_free(src_pool);
+	src_pool = NULL;
+
+	rte_mempool_free(dst_pool);
+	dst_pool = NULL;
+
+	/* free the worker parameters */
+	for (i = 0; i < nb_workers; i++) {
+		rte_free(lcores[i]);
+		lcores[i] = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..b550f4b23f
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It must be a power of two, and between
+;  64 and 4096.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first,last,increment,ADD|MUL.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of the whole case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+; We have to ensure a 1:1 mapping between the core and DMA device.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..de37120df6
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,DMA ring size,kick batch size,buffer size(B),number of buffers,memory(MB),average cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+/*
+ * Values stored in test_configure.test_type. TEST_TYPE_NONE is 0, i.e. the
+ * value of an entry of the static test_cases[] array that was never filled.
+ */
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every non-empty row of output_str[] to the result file and mark the
+ * row as empty again. When need_blankline is true, two rows of empty cells
+ * are written first. The file is flushed before returning.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		if (output_str[row][0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", output_str[row]);
+		output_str[row][0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/*
+ * Write the "Test Environment" rows (the timer frequency reported by
+ * rte_get_timer_hz(), in GHz) to the result file.
+ */
+static void
+output_env_info(void)
+{
+	const double freq_ghz = rte_get_timer_hz() / 1000000000.0;
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Test Environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%.3lf Ghz", freq_ghz);
+
+	output_csv(true);
+}
+
+/*
+ * Write the CSV column header of a test case, labelled with the case number
+ * and the test type string, to the result file.
+ */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
+			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/*
+ * Run the benchmark matching the test type of the case: the memory copy
+ * benchmark in DMA or CPU flavour. Any other type is only reported.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, true);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, false);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run all scenarios of one test case and write their results to the CSV
+ * file.
+ *
+ * The case may have one variable entry (an entry whose incr is not 0). It is
+ * swept from first to last with the configured ADD/MUL step, one scenario
+ * per value. Without a variable entry a single scenario is run.
+ *
+ * The sweep stops as soon as a step does not increase the value (MUL by 1,
+ * MUL starting from 0, or a 32-bit wrap-around), which would otherwise
+ * repeat scenarios forever.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* one lcore is needed on top of the workers */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		const uint32_t prev = var_entry->cur;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_ADD)
+			var_entry->cur += var_entry->incr;
+		else if (var_entry->op == OP_MUL)
+			var_entry->cur *= var_entry->incr;
+		else {
+			printf("No proper operation for variable entry.\n");
+			break;
+		}
+
+		if (var_entry->cur <= prev) {
+			printf("Variable entry does not advance, stopping the case.\n");
+			break;
+		}
+	}
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case: a list of decimal lcore ids
+ * separated by commas and/or spaces. The lcore/DMA map of the test case is
+ * reset and filled with the ids.
+ *
+ * Returns 0 on success. Returns -1 on a NULL argument, an allocation
+ * failure, a token that is not a decimal number, or more ids than the map
+ * can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input;
+	char *token;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok() modifies its input, so work on a private copy */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &(test_case->lcore_dma_map);
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	for (token = strtok(input, ", "); token != NULL; token = strtok(NULL, ", ")) {
+		unsigned long lcore_id;
+		char *end;
+
+		/* bound by the real array size, not by MAX_LCORE_NB */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcores, at most %u are supported\n",
+					(unsigned int)RTE_DIM(lcore_dma_map->lcores));
+			ret = -1;
+			break;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0') {
+			fprintf(stderr, "Illegal lcore ID %s\n", token);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint32_t)lcore_id;
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case: a comma separated list of
+ * "lcore<ID>@<DMA device name>" pairs. The lcore/DMA map of the test case is
+ * reset and filled with the lcore ids and device names.
+ *
+ * Returns 0 on success. Returns -1 on a NULL argument, an allocation
+ * failure, an empty list, a pair without '@', a missing or negative lcore
+ * id, or more pairs than the map can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	long lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* strtok()/rte_strsplit() modify their input, so work on a copy */
+	input = strdup(value);
+	if (input == NULL) {
+		fprintf(stderr, "No memory for the DMA addresses\n");
+		return -1;
+	}
+	addrs = input;
+
+	/* skip leading blanks; never step over the terminating NUL */
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	do {
+		/* both halves are required: ptrs[1] is read below */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal DMA address\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start || lcore_id < 0) {
+			fprintf(stderr, "No input lcore ID or ID %ld is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		/* bound by the real array size, not by MAX_LCORE_NB */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "lcores count error\n");
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = (uint32_t)lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+				RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse a configuration value of the form "first" or
+ * "first,last,increment,ADD|MUL" into entry.
+ *
+ * Returns the number of fields parsed (1 or 4), or -1 when an argument is
+ * NULL, the value is empty, the field count is neither 1 nor 4, or the
+ * operation is not "ADD" or "MUL".
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char buf[255] = {0};
+	char *fields[MAX_PARAMS_PER_ENTRY];
+	int nb_fields;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	/* buf is zero-filled and one byte longer than the copy, so it stays terminated */
+	strncpy(buf, value, 254);
+	if (buf[0] == '\0')
+		return -1;
+
+	nb_fields = rte_strsplit(buf, strlen(buf), fields, MAX_PARAMS_PER_ENTRY, ',');
+	if (nb_fields != 1 && nb_fields != 4)
+		return -1;
+
+	entry->first = (uint32_t)atoi(fields[0]);
+	entry->cur = entry->first;
+
+	if (nb_fields == 1) {
+		/* fixed value: no sweep */
+		entry->op = OP_NONE;
+		entry->last = 0;
+		entry->incr = 0;
+		return 1;
+	}
+
+	entry->last = (uint32_t)atoi(fields[1]);
+	entry->incr = (uint32_t)atoi(fields[2]);
+
+	if (strcmp(fields[3], "MUL") == 0) {
+		entry->op = OP_MUL;
+		return 4;
+	}
+
+	if (strcmp(fields[3], "ADD") == 0) {
+		entry->op = OP_ADD;
+		return 4;
+	}
+
+	printf("Invalid op %s.\n", fields[3]);
+	return -1;
+}
+
+/*
+ * Read an integer entry of a config section into *val.
+ * Returns 0 on success, -1 (with a message) when the entry is missing.
+ */
+static int
+load_int_entry(struct rte_cfgfile *cfgfile, const char *section, const char *name, int *val)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, name);
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", name, section);
+		return -1;
+	}
+
+	*val = atoi(str);
+	return 0;
+}
+
+/*
+ * Load the test cases from the configuration file at path into test_cases[].
+ *
+ * Sections are looked up as "case1", "case2", ... up to the number of
+ * sections in the file. A case that cannot be parsed completely, including
+ * one with a missing mandatory entry, is marked invalid and skipped by the
+ * caller.
+ *
+ * The eal_args string is duplicated because the configuration file is closed
+ * before returning and the entries it handed out must not be used after
+ * that. The copy is intentionally kept for the lifetime of the process.
+ *
+ * Returns the number of sections processed. Exits the process when the file
+ * cannot be loaded or holds more than MAX_TEST_CASES sections.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *eal_args;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+	int val;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (case_type == NULL) {
+			printf("Error: No case type in case %d, the test will be finished here.\n",
+				i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Wrong test case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (load_int_entry(cfgfile, section_name, "src_numa_node", &val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->src_numa_node = val;
+
+		if (load_int_entry(cfgfile, section_name, "dst_numa_node", &val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->dst_numa_node = val;
+
+		/* nb_vp counts the entries given as a first,last,incr,op sweep */
+		nb_vp = 0;
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr == 4)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr == 4)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr == 4)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr == 4)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Case %d error, each section can only have a single variable parameter.\n",
+					i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (load_int_entry(cfgfile, section_name, "cache_flush", &val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->cache_flush = (uint8_t)val;
+
+		if (load_int_entry(cfgfile, section_name, "test_seconds", &val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->test_secs = (uint16_t)val;
+
+		/* eal_args is optional; copy it so it survives rte_cfgfile_close() */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->eal_args = NULL;
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: No memory for eal_args in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector passed to rte_eal_init() for one test case.
+ *
+ * The application's own options (--config/--result and their values) are
+ * dropped from argv, the remaining arguments are copied, and the
+ * space-separated eal_args of the test case (may be NULL) are appended.
+ *
+ * new_argv must provide MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes.
+ * Arguments that do not fit in new_argv are dropped with a warning instead
+ * of being written past the end of the array.
+ *
+ * Returns the number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* skip the option and its value */
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB)
+			goto too_many;
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, MAX_EAL_PARAM_LEN);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			if (new_argc >= MAX_EAL_PARAM_NB)
+				goto too_many;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+
+too_many:
+	fprintf(stderr, "Too many EAL arguments, only the first %d are used.\n",
+			MAX_EAL_PARAM_NB);
+	return new_argc;
+}
+
+/*
+ * Entry point. Locates the --config and --result options, loads the test
+ * cases from the configuration file and runs every valid case in a forked
+ * child process, which initialises the EAL with the remaining command line
+ * arguments plus the eal_args of the case. Results are appended to the CSV
+ * result file.
+ *
+ * Returns -1 when no configuration file is given or the result file cannot
+ * be created, 0 otherwise.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+
+	memset(args, 0, sizeof(args));
+
+	/* pargs[] is the char ** view of args[][] handed to append_eal_args() */
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* the value of an option is the argument that follows it */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		/*
+		 * No result file given: derive it from the config file path by
+		 * cutting the file name at its first '.' and appending
+		 * "_result.csv".
+		 */
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		char *token = strtok(basename(rst_path), ".");
+		if (token == NULL) {
+			printf("Config file error.\n");
+			return -1;
+		}
+		strcat(token, "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* create/truncate the result file; it is reopened in append mode below */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	printf("Running cases...\n");
+	for (i = 0; i < case_nb; i++) {
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No valid test type in test case %d.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
+		/* each case runs in its own process with its own EAL instance */
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			/* child: initialise the EAL, run the case, then exit */
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+
+			output_env_info();
+
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/* parent: wait for the case to finish and report how it ended */
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..12bc3f4e3f
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];
+
+typedef enum {
+	OP_NONE = 0,
+	OP_ADD,
+	OP_MUL
+} alg_op_type;
+
+struct test_configure_entry {
+	uint32_t first;
+	uint32_t last;
+	uint32_t incr;
+	alg_op_type op;
+	uint32_t cur;
+};
+
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB];
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
+	int16_t dma_ids[MAX_WORKER_NB];
+	uint16_t cnt;
+};
+
+struct test_configure {
+	bool is_valid;
+	uint8_t test_type;
+	const char *test_type_str;
+	uint16_t src_numa_node;
+	uint16_t dst_numa_node;
+	uint16_t opcode;
+	bool is_dma;
+	struct lcore_dma_map_t lcore_dma_map;
+	struct test_configure_entry mem_size;
+	struct test_configure_entry buf_size;
+	struct test_configure_entry ring_size;
+	struct test_configure_entry kick_batch;
+	uint8_t cache_flush;
+	uint32_t nr_buf;
+	uint16_t test_secs;
+	const char *eal_args;
+	uint8_t scenario_id;
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
diff --git a/doc/guides/rel_notes/release_23_07.rst b/doc/guides/rel_notes/release_23_07.rst
index 027ae7bd2d..2355b558bc 100644
--- a/doc/guides/rel_notes/release_23_07.rst
+++ b/doc/guides/rel_notes/release_23_07.rst
@@ -170,6 +170,12 @@ New Features

   See :doc:`../prog_guide/pdcp_lib` for more information.

+* **Added DMA device performance test application.**
+
+  Added a new application to test the performance of DMA device and CPU.
+
+  See the :doc:`../tools/dmaperf` for more details.
+

 Removed Items
 -------------
diff --git a/doc/guides/tools/dmaperf.rst b/doc/guides/tools/dmaperf.rst
new file mode 100644
index 0000000000..c5f8a9406f
--- /dev/null
+++ b/doc/guides/tools/dmaperf.rst
@@ -0,0 +1,103 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2023 Intel Corporation.
+
+dpdk-test-dma-perf Application
+==============================
+
+The ``dpdk-test-dma-perf`` tool is a Data Plane Development Kit (DPDK) application that enables
+testing the performance of DMA (Direct Memory Access) devices available within DPDK. It provides a
+test framework to assess the performance of CPU and DMA devices under various scenarios, such as
+varying buffer lengths. Doing so provides insight into the potential performance when using these
+DMA devices for acceleration in DPDK applications. It supports memory copy performance tests for
+now, comparing the performance of CPU and DMA automatically in various conditions with the help of a
+pre-set configuration file.
+
+
+Configuration
+-------------
+This application uses inherent DPDK EAL command-line options as well as custom command-line options
+in the application. An example configuration file for the application is provided and gives the
+meanings for each parameter.
+
+Here is an extracted sample from the configuration file (the complete sample can be found in the
+application source directory):
+
+.. code-block:: ini
+
+   [case1]
+   type=DMA_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   dma_ring_size=1024
+   kick_batch=32
+   src_numa_node=0
+   dst_numa_node=0
+   cache_flush=0
+   test_seconds=2
+   lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+   eal_args=--in-memory --file-prefix=test
+
+   [case2]
+   type=CPU_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   src_numa_node=0
+   dst_numa_node=1
+   cache_flush=0
+   test_seconds=2
+   lcore = 3, 4
+   eal_args=--in-memory --no-pci
+
+The configuration file is divided into multiple sections, each section represents a test case.
+The four variables mem_size, buf_size, dma_ring_size, and kick_batch can vary in each test case.
+The format for this is ``variable=first,last,increment,ADD|MUL``. This means that the first value
+of the variable is 'first', the last value is 'last', 'increment' is the step size, and ADD|MUL
+indicates whether the change is by addition or multiplication. Each case can only have one variable
+change, and each change will generate a scenario, so each case can have multiple scenarios.
+
+Parameter Definitions
+---------------------
+
+- **type**: The type of the test. Currently supported types are `DMA_MEM_COPY` and `CPU_MEM_COPY`.
+- **mem_size**: The size of the memory footprint.
+- **buf_size**: The memory size of a single operation.
+- **dma_ring_size**: The DMA ring buffer size. Must be a power of two, and between 64 and 4096.
+- **kick_batch**: The DMA operation batch size, should be greater than 1 normally.
+- **src_numa_node**: Controls the NUMA node where the source memory is allocated.
+- **dst_numa_node**: Controls the NUMA node where the destination memory is allocated.
+- **cache_flush**: Determines whether the cache should be flushed. `1` indicates to flush and `0` to not flush.
+- **test_seconds**: Controls the test time for each scenario.
+- **lcore_dma**: Specifies the lcore/DMA mapping.
+- **lcore**: Specifies the lcore for CPU testing.
+- **eal_args**: Specifies the EAL arguments.
+
+.. Note::
+
+	The mapping of lcore to DMA must be one-to-one and cannot be duplicated.
+
+To specify a configuration file, use the "\-\-config" flag followed by the path to the file.
+
+To specify a result file, use the "\-\-result" flag followed by the path to the file. If you do not
+specify a result file, one will be generated with the same name as the configuration file, with the
+addition of "_result.csv" at the end.
+
+
+Running the Application
+-----------------------
+
+Typical command-line invocation to execute the application:
+
+.. code-block:: console
+
+   dpdk-test-dma-perf --config=./config_dma.ini --result=./res_dma.csv
+
+Where `config_dma.ini` is the configuration file, and `res_dma.csv` will be the generated result
+file.
+
+After the tests, you can find the results in the `res_dma.csv` file.
+
+Limitations
+-----------
+
+Currently, this tool only supports memory copy performance tests. Additional enhancements are
+possible in the future to support more types of tests for DMA devices and CPUs.
diff --git a/doc/guides/tools/index.rst b/doc/guides/tools/index.rst
index 6f84fc31ff..857572da96 100644
--- a/doc/guides/tools/index.rst
+++ b/doc/guides/tools/index.rst
@@ -23,3 +23,4 @@ DPDK Tools User Guides
     testregex
     testmldev
     dts
+    dmaperf
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
  2023-06-26  5:41       ` Anoob Joseph
@ 2023-06-26 10:02         ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-26 10:02 UTC (permalink / raw)
  To: Anoob Joseph, thomas, Richardson, Bruce, mb, Xia, Chenbo,
	Amit Prakash Shukla, huangdengdui, Laatz, Kevin, fengchengwen,
	Jerin Jacob Kollanukkaran
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He,
	Xingguang, Ling, WeiX

Hi Anoob,

Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Monday, June 26, 2023 1:42 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> Prakash Shukla <amitprakashs@marvell.com>; huangdengdui@huawei.com;
> Laatz, Kevin <kevin.laatz@intel.com>; fengchengwen@huawei.com; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Ling, WeiX <weix.ling@intel.com>
> Subject: RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf application
> 
> Hi Cheng,
> 
> Please see inline.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Sent: Saturday, June 24, 2023 5:23 PM
> > To: Anoob Joseph <anoobj@marvell.com>; thomas@monjalon.net;
> > Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>; Amit
> > Prakash Shukla <amitprakashs@marvell.com>;
> huangdengdui@huawei.com;
> > Laatz, Kevin <kevin.laatz@intel.com>; fengchengwen@huawei.com; Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>; Ling,
> > WeiX <weix.ling@intel.com>
> > Subject: RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi Anoob,
> >
> > Replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Anoob Joseph <anoobj@marvell.com>
> > > Sent: Friday, June 23, 2023 2:53 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> Amit
> > > Prakash Shukla <amitprakashs@marvell.com>;
> > huangdengdui@huawei.com;
> > > Laatz, Kevin <kevin.laatz@intel.com>; fengchengwen@huawei.com; Jerin
> > > Jacob Kollanukkaran <jerinj@marvell.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> > YuanX
> > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> > Ling,
> > > WeiX <weix.ling@intel.com>
> > > Subject: RE: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > Hi Cheng,
> > >
> > > Thanks for the new version. Please see inline.
> > >
> > > Thanks,
> > > Anoob
> > >
> > > > -----Original Message-----
> > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Sent: Tuesday, June 20, 2023 12:24 PM
> > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > mb@smartsharesystems.com; chenbo.xia@intel.com; Amit Prakash
> > Shukla
> > > > <amitprakashs@marvell.com>; Anoob Joseph <anoobj@marvell.com>;
> > > > huangdengdui@huawei.com; kevin.laatz@intel.com;
> > > > fengchengwen@huawei.com; Jerin Jacob Kollanukkaran
> > > > <jerinj@marvell.com>
> > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > xingguang.he@intel.com;
> > > > weix.ling@intel.com; Cheng Jiang <cheng1.jiang@intel.com>
> > > > Subject: [EXT] [PATCH v8] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > External Email
> > > >
> > > > ------------------------------------------------------------------
> > > > --
> > > > -- There are many high-performance DMA devices supported in DPDK
> > > > now,
> > > and
> > > > these DMA devices can also be integrated into other modules of
> > > > DPDK as accelerators, such as Vhost. Before integrating DMA into
> > > > applications, developers need to know the performance of these DMA
> > > > devices in various scenarios and the performance of CPUs in the
> > > > same scenario, such as different buffer lengths. Only in this way
> > > > can we know the target performance of the application accelerated
> > > > by using them. This patch introduces a high-performance testing
> > > > tool, which supports comparing the performance of CPU and DMA in
> > > > different scenarios automatically with a pre- set config file.
> > > > Memory Copy performance test
> > > are supported for now.
> > > >
> > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > ---
> > > > v8:
> > > >   fixed string copy issue in parse_lcore();
> > > >   improved some data display format;
> > > >   added doc in doc/guides/tools;
> > > >   updated release notes;
> > > >
> > > > v7:
> > > >   fixed some strcpy issues;
> > > >   removed cache setup in calling rte_pktmbuf_pool_create();
> > > >   fixed some typos;
> > > >   added some memory free and null set operations;
> > > >   improved result calculation;
> > > > v6:
> > > >   improved code based on Anoob's comments;
> > > >   fixed some code structure issues;
> > > > v5:
> > > >   fixed some LONG_LINE warnings;
> > > > v4:
> > > >   fixed inaccuracy of the memory footprint display;
> > > > v3:
> > > >   fixed some typos;
> > > > v2:
> > > >   added lcore/dmadev designation;
> > > >   added error case process;
> > > >   removed worker_threads parameter from config.ini;
> > > >   improved the logs;
> > > >   improved config file;
> > > >
> > > >  app/meson.build                        |   1 +
> > > >  app/test-dma-perf/benchmark.c          | 498 +++++++++++++++++++++
> > > >  app/test-dma-perf/config.ini           |  61 +++
> > > >  app/test-dma-perf/main.c               | 594 +++++++++++++++++++++++++
> > > >  app/test-dma-perf/main.h               |  69 +++
> > > >  app/test-dma-perf/meson.build          |  17 +
> > > >  doc/guides/rel_notes/release_23_07.rst |   6 +
> > > >  doc/guides/tools/dmaperf.rst           | 103 +++++
> > > >  doc/guides/tools/index.rst             |   1 +
> > > >  9 files changed, 1350 insertions(+)  create mode 100644
> > > > app/test-dma-perf/benchmark.c  create mode 100644
> > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > > mode
> > > > 100644 app/test-dma-perf/meson.build  create mode 100644
> > > > doc/guides/tools/dmaperf.rst
> > > >
> > >
> 
> <snip>
> 
> > >
> > > > +			fprintf(stderr, "Error: Fail to find DMA %s.\n",
> > > > dma_name);
> > > > +			goto end;
> > > > +		}
> > > > +
> > > > +		ldm->dma_ids[i] = dev_id;
> > > > +		configure_dmadev_queue(dev_id, ring_size);
> > > > +		++nb_dmadevs;
> > > > +	}
> > > > +
> > > > +end:
> > > > +	if (nb_dmadevs < nb_workers) {
> > > > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> > > > nb_dmadevs, nb_workers);
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static inline void
> > > > +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
> > > > +			volatile struct worker_info *worker_info) {
> > > > +	int ret;
> > > > +	uint16_t nr_cpl;
> > > > +
> > > > +	ret = rte_dma_submit(dev_id, 0);
> > > > +	if (ret < 0) {
> > > > +		rte_dma_stop(dev_id);
> > > > +		rte_dma_close(dev_id);
> > > > +		rte_exit(EXIT_FAILURE, "Error with dma submit.\n");
> > > > +	}
> > > > +
> > > > +	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,
> > > > NULL);
> > > > +	*async_cnt -= nr_cpl;
> > > > +	worker_info->total_cpl += nr_cpl; }
> > > > +
> > > > +static inline int
> > > > +do_dma_mem_copy(void *p)
> > >
> > > [Anoob] Just curious, why not pass struct lcore_params *para itself?
> > > Is it because the pointer is volatile? If yes, then we can take an
> > > AI to split the struct into volatile and non-volatile parts.
> >
> > [Cheng] The reason I did it this way is because I want to launch this
> > function on another core by spawning a new thread, and
> > rte_eal_remote_launch() takes a void * as the parameter. That's why I
> > passed void *p.  Your suggestion to split the struct into volatile and
> > non-volatile parts is quite reasonable. I am thinking about the best way to
> implement it. Thanks.
> 
> [Anoob] Instead of passing the address of index variable as void *, you can
> easily send lcore_params pointer, right?
> 

[Cheng] Yes, you are right. I can pass the lcore_params pointer. And I'll fix it in the next version. The new lcore_params will be non-volatile with volatile worker_info in it. This is more reasonable.

> >
> > >
> > > > +{
> > > > +	const uint16_t *para_idx = (uint16_t *)p;
> > > > +	volatile struct lcore_params *para = lcores_p[*para_idx].v_ptr;
> > > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > > +	const uint16_t dev_id = para->dev_id;
> > > > +	const uint32_t nr_buf = para->nr_buf;
> > > > +	const uint16_t kick_batch = para->kick_batch;
> > > > +	const uint32_t buf_size = para->buf_size;
> > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > +	uint16_t nr_cpl;
> > > > +	uint64_t async_cnt = 0;
> > > > +	uint32_t i;
> > > > +	uint32_t poll_cnt = 0;
> > > > +	int ret;
> > > > +
> > > > +	worker_info->stop_flag = false;
> > > > +	worker_info->ready_flag = true;
> > > > +
> > > > +	while (!worker_info->start_flag)
> > > > +		;
> > > > +
> > > > +	while (1) {
> > > > +		for (i = 0; i < nr_buf; i++) {
> > > > +dma_copy:
> > > > +			ret = rte_dma_copy(dev_id, 0,
> > > > rte_pktmbuf_iova(srcs[i]),
> > > > +				rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> > >
> > > [Anoob] Do we need to use ' rte_mbuf_data_iova' here instead of
> > > 'rte_pktmbuf_iova'?
> >
> > [Cheng] yes rte_mbuf_data_iova is more appropriate, I'll fix it in the
> > next version. Thanks.
> >
> > >
> > > > +			if (unlikely(ret < 0)) {
> > > > +				if (ret == -ENOSPC) {
> > > > +					do_dma_submit_and_poll(dev_id,
> > > > &async_cnt, worker_info);
> > > > +					goto dma_copy;
> > > > +				} else {
> > > > +					/* Error exit */
> > > > +					rte_dma_stop(dev_id);
> > >
> > > [Anoob] Missing rte_dma_close() here. Also, may be introduce a
> > > static void function so that rte_exit call etc won't be part of the fastpath
> loop.
> > >
> > > May be something like below and you can call it from here and
> > > "do_dma_submit_and_poll".
> > >
> > > static void
> > > error_exit(int dev_id)
> > > {
> > > 	/* Error exit */
> > > 	rte_dma_stop(dev_id);
> > > 	rte_dma_close(dev_id);
> > > 	rte_exit(EXIT_FAILURE, "DMA enqueue failed\n"); }
> > >
> >
> > [Cheng] I'm not so sure here. rte_dma_close() is called in the
> > rte_exit(). Do we still call it explicitly before rte_exit()?
> 
> [Anoob] In ' do_dma_submit_and_poll', there is rte_dma_close() before
> rte_exit(). I'm fine either way as long is it is consistent. Said that, I think it is
> better to call close() from app, rather than relying on rte_exit.
> 

[Cheng] sure, it makes sense to me that app should call rte_dma_close(), I'll fix it in the next version. Thanks.

> <snip>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (7 preceding siblings ...)
  2023-06-26  9:41 ` [PATCH v9] " Cheng Jiang
@ 2023-06-28  1:20 ` Cheng Jiang
  2023-06-28  4:42   ` [EXT] " Anoob Joseph
                     ` (3 more replies)
  2023-06-29 13:14 ` [PATCH v11] " Cheng Jiang
  9 siblings, 4 replies; 53+ messages in thread
From: Cheng Jiang @ 2023-06-28  1:20 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, fengchengwen, jerinj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	weix.ling, Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory Copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v10:
  rebased code from 23.07-rc2;
v9:
  improved error handling;
  improved lcore_params structure;
  improved mbuf api calling;
  improved exit process;
  fixed some typos;
  added scenario summary data display;
  removed unnecessary include;
v8:
  fixed string copy issue in parse_lcore();
  improved some data display format;
  added doc in doc/guides/tools;
  updated release notes;
v7:
  fixed some strcpy issues;
  removed cache setup in calling rte_pktmbuf_pool_create();
  fixed some typos;
  added some memory free and null set operations;
  improved result calculation;
v6:
  improved code based on Anoob's comments;
  fixed some code structure issues;
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build                        |   1 +
 app/test-dma-perf/benchmark.c          | 508 ++++++++++++++++++++
 app/test-dma-perf/config.ini           |  61 +++
 app/test-dma-perf/main.c               | 616 +++++++++++++++++++++++++
 app/test-dma-perf/main.h               |  64 +++
 app/test-dma-perf/meson.build          |  17 +
 doc/guides/rel_notes/release_23_07.rst |   6 +
 doc/guides/tools/dmaperf.rst           | 103 +++++
 doc/guides/tools/index.rst             |   1 +
 9 files changed, 1377 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build
 create mode 100644 doc/guides/tools/dmaperf.rst

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..0601e0d171
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,508 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+#define POLL_MAX 1000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+#define CSV_TOTAL_LINE_FMT "Scenario %u Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"
+
+struct worker_info {
+	bool ready_flag;
+	bool start_flag;
+	bool stop_flag;
+	uint32_t total_cpl;
+	uint32_t test_cpl;
+};
+
+struct lcore_params {
+	uint8_t scenario_id;
+	unsigned int lcore_id;
+	char *dma_name;
+	uint16_t worker_id;
+	uint16_t dev_id;
+	uint32_t nr_buf;
+	uint16_t kick_batch;
+	uint32_t buf_size;
+	uint16_t test_secs;
+	struct rte_mbuf **srcs;
+	struct rte_mbuf **dsts;
+	volatile struct worker_info worker_info;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static struct lcore_params *lcores[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	float ops; /* completed copy operations per second */
+
+	*memory = (float)((uint64_t)buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+	*ave_cycle = total_cnt ? test_secs * rte_get_timer_hz() / total_cnt : 0; /* no div by 0 */
+	ops = (float)total_cnt / test_secs;
+	*mops = ops / (1000 * 1000);
+	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000); /* Gbps */
+}
+
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
+			float memory, float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
+				lcore_id, dma_name, ring_size, kick_batch);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
+	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+static inline void
+cache_flush_buf(__rte_unused struct rte_mbuf **array,
+		__rte_unused uint32_t buf_size,
+		__rte_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	char *data;
+	struct rte_mbuf **srcs = array;
+	uint32_t i, offset;
+
+	for (i = 0; i < nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (offset = 0; offset < buf_size; offset += 64)
+			__builtin_ia32_clflush(data + offset);
+	}
+#endif
+}
+
+/* Configuration of device. */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		if (dev_id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+static void
+error_exit(int dev_id)
+{
+	rte_dma_stop(dev_id);
+	rte_dma_close(dev_id);
+	rte_exit(EXIT_FAILURE, "DMA error\n");
+}
+
+static inline void
+do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
+			volatile struct worker_info *worker_info)
+{
+	int ret;
+	uint16_t nr_cpl;
+
+	ret = rte_dma_submit(dev_id, 0);
+	if (ret < 0)
+		error_exit(dev_id);
+
+	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+	*async_cnt -= nr_cpl;
+	worker_info->total_cpl += nr_cpl;
+}
+
+static inline int
+do_dma_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
+				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint32_t nr_buf = para->nr_buf;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/* copy from src to dst via virtual addresses; an IOVA may be physical */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src mbufs failed.\n");
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one memory-copy scenario with the current values in cfg and record the
+ * per-worker and total results.
+ *
+ * cfg    - test case configuration; cfg->nr_buf is (re)computed here from
+ *          mem_size (in MB) and buf_size, split between source and
+ *          destination buffers.
+ * is_dma - true to copy with the DMA devices mapped in cfg->lcore_dma_map,
+ *          false to copy with the CPU on the listed lcores.
+ *
+ * Flow: set up memory, optionally configure DMA devices and flush caches,
+ * prepare the parameters of every worker, launch the workers, wait until all
+ * of them are ready, release them, sleep TEST_WAIT_U_SECOND before taking the
+ * first completion snapshot, sleep test_secs, take the difference, stop the
+ * workers and report. All resources are released before returning.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory = 0;
+	uint32_t avg_cycles = 0;
+	uint32_t avg_cycles_total;
+	float mops, mops_total;
+	float bandwidth, bandwidth_total;
+
+	/* Both values are used as divisors below. */
+	if (buf_size == 0 || nb_workers == 0) {
+		printf("Error: buffer size and number of workers must not be zero.\n");
+		return;
+	}
+
+	/* 64-bit arithmetic: mem_size is in MB and would wrap in 32 bits. */
+	nr_buf = (uint32_t)(((uint64_t)cfg->mem_size.cur * 1024 * 1024) /
+			((uint64_t)buf_size * 2));
+	cfg->nr_buf = nr_buf;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush == 1) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/*
+	 * Prepare the parameters of every worker before launching any of them,
+	 * so that an allocation failure can bail out without leaving workers
+	 * spinning. The blocks are zeroed: the control flags and counters in
+	 * worker_info are read before the worker has written all of them.
+	 */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+		lcores[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (lcores[i] == NULL) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			goto out;
+		}
+		if (is_dma) {
+			lcores[i]->dma_name = ldm->dma_names[i];
+			lcores[i]->dev_id = ldm->dma_ids[i];
+			lcores[i]->kick_batch = kick_batch;
+		}
+		lcores[i]->worker_id = i;
+		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		lcores[i]->buf_size = buf_size;
+		lcores[i]->test_secs = test_secs;
+		lcores[i]->srcs = srcs + offset;
+		lcores[i]->dsts = dsts + offset;
+		lcores[i]->scenario_id = cfg->scenario_id;
+		lcores[i]->lcore_id = lcore_id;
+	}
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+	}
+
+	/* Wait until every worker has reported ready. */
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (lcores[i]->worker_info.ready_flag == false) {
+				ready = false;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.start_flag = true;
+
+	/* First snapshot after a short run-in period. */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl;
+
+	/* Completions counted during the measured interval only. */
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl -
+						lcores[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	mops_total = 0;
+	bandwidth_total = 0;
+	avg_cycles_total = 0;
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			lcores[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, lcores[i]->lcore_id,
+					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
+					avg_cycles, buf_size, nr_buf / nb_workers, memory,
+					bandwidth, mops, is_dma);
+		mops_total += mops;
+		bandwidth_total += bandwidth;
+		avg_cycles_total += avg_cycles;
+	}
+	printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n", bandwidth_total, mops_total);
+	snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN, CSV_TOTAL_LINE_FMT,
+			cfg->scenario_id, nr_buf, memory * nb_workers,
+			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
+
+out:
+	/* free mbufs used in the test */
+	if (srcs != NULL)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts != NULL)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	/* free the pointers for the mbufs */
+	rte_free(srcs);
+	srcs = NULL;
+	rte_free(dsts);
+	dsts = NULL;
+
+	rte_mempool_free(src_pool);
+	src_pool = NULL;
+
+	rte_mempool_free(dst_pool);
+	dst_pool = NULL;
+
+	/* free the worker parameters */
+	for (i = 0; i < nb_workers; i++) {
+		rte_free(lcores[i]);
+		lcores[i] = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..b550f4b23f
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It must be a power of two, and between
+;  64 and 4096.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first,last,increment,ADD|MUL.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of each scenario.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+; We have to ensure a 1:1 mapping between the core and DMA device.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..de37120df6
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,DMA ring size,kick batch size,buffer size(B),number of buffers,memory(MB),average cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Write every non-empty line buffered in output_str[] to the result file and
+ * mark it as consumed. When need_blankline is set, two all-empty CSV rows are
+ * written first as a visual separator. The file is flushed before returning.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t idx = 0;
+	int sep;
+
+	if (need_blankline) {
+		for (sep = 0; sep < 2; sep++)
+			fprintf(fd, ",,,,,,,,\n");
+	}
+
+	while (idx < RTE_DIM(output_str)) {
+		char *line = output_str[idx++];
+
+		if (line[0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", line);
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/*
+ * Queue the "Test Environment" block (timer frequency in GHz) and write it to
+ * the result file, preceded by the separator rows.
+ */
+static void
+output_env_info(void)
+{
+	const double timer_ghz = rte_get_timer_hz() / 1000000000.0;
+	char *title = output_str[0];
+	char *freq = output_str[1];
+
+	snprintf(title, MAX_OUTPUT_STR_LEN, "Test Environment:\n");
+	snprintf(freq, MAX_OUTPUT_STR_LEN, "CPU frequency,%.3lf Ghz", timer_ghz);
+
+	output_csv(true);
+}
+
+/*
+ * Queue the CSV column header of a test case (case number and test type
+ * name) and write it to the result file, preceded by the separator rows.
+ */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *hdr = output_str[0];
+	const char *type_name = case_cfg->test_type_str;
+
+	snprintf(hdr, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT, case_id, type_name);
+
+	output_csv(true);
+}
+
+/*
+ * Dispatch one scenario to the benchmark that matches the case's test type.
+ * Both supported types share mem_copy_benchmark() and differ only in whether
+ * DMA is used; any other type is reported and skipped.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, true);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, false);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * case_id  - 1-based case number, used for messages and the CSV header.
+ * case_cfg - parsed configuration of the case.
+ *
+ * The case needs more lcores than it has workers; otherwise it is skipped.
+ * At most one parameter is swept: the entry with a non-zero increment (if
+ * several are set, the last one checked below wins). A case without a swept
+ * parameter runs exactly one scenario. The sweep stops when the value passes
+ * "last", can no longer advance, or would not fit in 32 bits.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+	uint64_t next;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_NONE) {
+			/* No swept parameter: the single scenario is done. */
+			break;
+		} else if (var_entry->op == OP_ADD) {
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		} else if (var_entry->op == OP_MUL) {
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		} else {
+			printf("No proper operation for variable entry.\n");
+			break;
+		}
+
+		/* e.g. multiplying by 1 or starting from 0 would loop forever */
+		if (next <= var_entry->cur) {
+			printf("Variable entry does not advance, stop here.\n");
+			break;
+		}
+
+		/* a wrapped 32-bit value could wrongly pass the "last" check */
+		if (next > UINT32_MAX)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case: a list of decimal lcore IDs
+ * separated by commas and/or spaces (e.g. "3, 4").
+ *
+ * The lcore/DMA map of test_case is cleared and refilled with the IDs.
+ * Returns 0 on success, -1 on NULL arguments, allocation failure, a token
+ * that is not a plain number in the 16-bit range, or more IDs than the map
+ * can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	size_t len;
+	char *input;
+	struct lcore_dma_map_t *lcore_dma_map;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	len = strlen(value);
+	input = malloc(len + 1);
+	if (input == NULL)
+		return -1;
+	strlcpy(input, value, len + 1);
+	lcore_dma_map = &(test_case->lcore_dma_map);
+
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	char *token = strtok(input, ", ");
+	while (token != NULL) {
+		char *end;
+		unsigned long lcore_id;
+
+		/* bound by the real array size, not by an unrelated constant */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			free(input);
+			return -1;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0' || lcore_id > UINT16_MAX) {
+			free(input);
+			return -1;
+		}
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint16_t)lcore_id;
+
+		token = strtok(NULL, ", ");
+	}
+
+	free(input);
+	return 0;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case: a comma separated list of
+ * "lcore<ID>@<device name>" pairs, e.g.
+ * "lcore10@0000:00:04.2, lcore11@0000:00:04.3".
+ *
+ * The lcore/DMA map of test_case is cleared and refilled with one lcore ID
+ * and one device name per pair. Returns 0 on success and -1 on NULL
+ * arguments, allocation failure, an empty list, a pair without '@' or
+ * without "lcore<ID>", or more pairs than the map can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	input = strndup(value, strlen(value) + 1);
+	if (input == NULL)
+		return -1;
+	addrs = input;
+
+	/* skip leading blanks; stops at the terminator of an empty string */
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	do {
+		/* both halves are needed: ptrs[1] is unset without an '@' */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal DMA address\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		/* step over the "lcore" prefix */
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		/* bound by the real array size, not by an unrelated constant */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "lcores count error\n");
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+				RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse one numeric parameter of a test case.
+ *
+ * value is either a single number ("1024") or a sweep description
+ * "first,last,increment,ADD|MUL". entry receives the parsed fields; "cur" is
+ * initialised to "first".
+ *
+ * Returns the number of fields parsed (1 or 4), or -1 when an argument is
+ * NULL, the string is empty, the field count is neither 1 nor 4, or the
+ * operation name is unknown.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char buf[255] = {0};
+	char *fields[MAX_PARAMS_PER_ENTRY];
+	int nb_fields;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	/* at most 254 characters are kept; the last byte stays '\0' */
+	strncpy(buf, value, 254);
+	if (buf[0] == '\0')
+		return -1;
+
+	nb_fields = rte_strsplit(buf, strlen(buf), fields, MAX_PARAMS_PER_ENTRY, ',');
+	if (nb_fields != 1 && nb_fields != 4)
+		return -1;
+
+	entry->first = (uint32_t)atoi(fields[0]);
+	entry->cur = entry->first;
+
+	if (nb_fields == 1) {
+		/* fixed value, nothing to sweep */
+		entry->op = OP_NONE;
+		entry->last = 0;
+		entry->incr = 0;
+		return 1;
+	}
+
+	entry->last = (uint32_t)atoi(fields[1]);
+	entry->incr = (uint32_t)atoi(fields[2]);
+
+	if (strcmp(fields[3], "MUL") == 0) {
+		entry->op = OP_MUL;
+		return 4;
+	}
+
+	if (strcmp(fields[3], "ADD") == 0) {
+		entry->op = OP_ADD;
+		return 4;
+	}
+
+	printf("Invalid op %s.\n", fields[3]);
+	return -1;
+}
+
+/*
+ * Read the integer entry "key" of "section" into *val.
+ * Returns 0 on success, -1 (with a message) when the entry is absent.
+ */
+static int
+get_cfg_int(struct rte_cfgfile *cfgfile, const char *section, const char *key, int *val)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, key);
+
+	if (str == NULL) {
+		printf("Error: \"%s\" is missing in %s.\n", key, section);
+		return -1;
+	}
+
+	*val = atoi(str);
+	return 0;
+}
+
+/*
+ * Load the configuration file at path into test_cases[].
+ *
+ * Sections are looked up by name as "case1", "case2", ... up to the number of
+ * sections in the file. A case with a missing or malformed entry is marked
+ * invalid and parsing continues with the next one. The process exits when
+ * the file cannot be opened or holds more than MAX_TEST_CASES sections.
+ *
+ * Returns the number of cases processed (valid or not).
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *eal_args;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+	int int_val;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (case_type == NULL) {
+			printf("Error: No case type in case %d, the test will be finished here.\n",
+				i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Wrong test case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (get_cfg_int(cfgfile, section_name, "src_numa_node", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->src_numa_node = (uint16_t)int_val;
+
+		if (get_cfg_int(cfgfile, section_name, "dst_numa_node", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->dst_numa_node = (uint16_t)int_val;
+
+		/* nb_vp counts the swept ("first,last,incr,op") parameters */
+		nb_vp = 0;
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr == 4)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr == 4)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr == 4)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr == 4)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Case %d error, each section can only have a single variable parameter.\n",
+					i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (get_cfg_int(cfgfile, section_name, "cache_flush", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->cache_flush = (uint8_t)int_val;
+
+		if (get_cfg_int(cfgfile, section_name, "test_seconds", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->test_secs = (uint16_t)int_val;
+
+		/*
+		 * The entry string belongs to cfgfile, which is closed below,
+		 * while eal_args is used later when the case is run. Keep a
+		 * private copy; it lives until the process exits.
+		 */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: out of memory in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			test_case->eal_args = NULL;
+		}
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector handed to the EAL for one test case.
+ *
+ * The command line is copied into new_argv without the application's own
+ * "--config <path>" and "--result <path>" pairs, then the space separated
+ * words of eal_args (may be NULL) are appended.
+ *
+ * new_argv must provide MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes
+ * each. Arguments that do not fit are dropped with a message; longer
+ * arguments are truncated by strlcpy().
+ *
+ * Returns the number of entries stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* skip the option and its value */
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB) {
+			printf("Too many EAL arguments, the rest is ignored.\n");
+			return new_argc;
+		}
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, MAX_EAL_PARAM_LEN);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			/* a run of spaces can yield empty tokens */
+			if (tokens[i][0] == '\0')
+				continue;
+			if (new_argc >= MAX_EAL_PARAM_NB) {
+				printf("Too many EAL arguments, the rest is ignored.\n");
+				break;
+			}
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point.
+ *
+ * Picks "--config <path>" (mandatory) and "--result <path>" (optional, the
+ * default is derived from the config file name plus "_result.csv") from the
+ * command line, loads all test cases and truncates the result file. Every
+ * valid case is then run in its own forked child, which initialises the EAL
+ * with the remaining command line arguments plus the case's eal_args,
+ * appends its results to the result file and exits; the parent waits for the
+ * child and reports how it ended. Invalid cases are only noted in the result
+ * file.
+ *
+ * Returns 0 when all cases were processed, -1 on a setup or file error.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last leaves its path unset */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		char *token = strtok(basename(rst_path), ".");
+		if (token == NULL) {
+			printf("Config file error.\n");
+			return -1;
+		}
+		strcat(token, "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* create or truncate the result file; it is reopened in append mode */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	printf("Running cases...\n");
+	for (i = 0; i < case_nb; i++) {
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return -1;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No valid test type in test case %d.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return -1;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
+		/*
+		 * The child inherits a copy of any unflushed stdout buffer and
+		 * would write it again on exit; flush before forking so each
+		 * message is printed once even when stdout is not a terminal.
+		 */
+		fflush(stdout);
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				/* report the failure to the parent */
+				rte_eal_cleanup();
+				exit(EXIT_FAILURE);
+			}
+
+			output_env_info();
+
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..12bc3f4e3f
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];
+
+/*
+ * How a swept test parameter moves from one scenario to the next:
+ * OP_NONE - the parameter has a single fixed value;
+ * OP_ADD  - the increment is added to the current value;
+ * OP_MUL  - the current value is multiplied by the increment.
+ */
+typedef enum {
+	OP_NONE = 0,
+	OP_ADD,
+	OP_MUL
+} alg_op_type;
+
+/*
+ * One numeric test parameter, given in the configuration file either as a
+ * single value or as "first,last,increment,ADD|MUL".
+ */
+struct test_configure_entry {
+	/* starting value (the only value for a fixed parameter) */
+	uint32_t first;
+	/* upper bound of the sweep; 0 for a fixed parameter */
+	uint32_t last;
+	/* step of the sweep; 0 for a fixed parameter */
+	uint32_t incr;
+	/* how "incr" is applied to "cur" */
+	alg_op_type op;
+	/* value used by the scenario currently running */
+	uint32_t cur;
+};
+
+/*
+ * Workers of a test case: entry i describes worker i.
+ * Every array holds MAX_WORKER_NB entries, so "cnt" must never exceed
+ * MAX_WORKER_NB.
+ */
+struct lcore_dma_map_t {
+	/* lcore ID each worker runs on */
+	uint32_t lcores[MAX_WORKER_NB];
+	/* DMA device name paired with each lcore (DMA tests only) */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
+	/* DMA device ID of each worker; not filled in by the config parser */
+	int16_t dma_ids[MAX_WORKER_NB];
+	/* number of valid entries in the arrays above */
+	uint16_t cnt;
+};
+
+/* Configuration of one test case, i.e. one section of the config file. */
+struct test_configure {
+	/* false when the section could not be parsed; the case is skipped */
+	bool is_valid;
+	/* numeric test type selected by the "type" entry */
+	uint8_t test_type;
+	/* name of the test type, used in messages and in the CSV header */
+	const char *test_type_str;
+	/* NUMA node the source buffers are allocated on */
+	uint16_t src_numa_node;
+	/* NUMA node the destination buffers are allocated on */
+	uint16_t dst_numa_node;
+	/* NOTE(review): not referenced by main.c or the benchmark code shown in this patch */
+	uint16_t opcode;
+	/* NOTE(review): not referenced by main.c or the benchmark code shown in this patch */
+	bool is_dma;
+	/* lcores (and DMA devices) used as workers */
+	struct lcore_dma_map_t lcore_dma_map;
+	/* memory footprint of the test, in MB */
+	struct test_configure_entry mem_size;
+	/* size of a single copy operation, in bytes */
+	struct test_configure_entry buf_size;
+	/* DMA ring size (DMA tests only) */
+	struct test_configure_entry ring_size;
+	/* DMA operation batch size (DMA tests only) */
+	struct test_configure_entry kick_batch;
+	/* 1 to flush the buffers from the cache before the test */
+	uint8_t cache_flush;
+	/* number of source/destination buffer pairs, computed per scenario */
+	uint32_t nr_buf;
+	/* measured duration of each scenario, in seconds */
+	uint16_t test_secs;
+	/* extra EAL arguments of the case, may be NULL */
+	const char *eal_args;
+	/* 1-based number of the scenario currently running */
+	uint8_t scenario_id;
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
diff --git a/doc/guides/rel_notes/release_23_07.rst b/doc/guides/rel_notes/release_23_07.rst
index 4459144140..796cc5517d 100644
--- a/doc/guides/rel_notes/release_23_07.rst
+++ b/doc/guides/rel_notes/release_23_07.rst
@@ -200,6 +200,12 @@ New Features

   Enhanced the GRO library to support TCP packets over IPv6 network.

+* **Added DMA device performance test application.**
+
+  Added a new application to test the performance of DMA device and CPU.
+
+  See the :doc:`../tools/dmaperf` for more details.
+

 Removed Items
 -------------
diff --git a/doc/guides/tools/dmaperf.rst b/doc/guides/tools/dmaperf.rst
new file mode 100644
index 0000000000..c5f8a9406f
--- /dev/null
+++ b/doc/guides/tools/dmaperf.rst
@@ -0,0 +1,103 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2023 Intel Corporation.
+
+dpdk-test-dma-perf Application
+==============================
+
+The ``dpdk-test-dma-perf`` tool is a Data Plane Development Kit (DPDK) application that enables
+testing the performance of DMA (Direct Memory Access) devices available within DPDK. It provides a
+test framework to assess the performance of CPU and DMA devices under various scenarios, such as
+varying buffer lengths. Doing so provides insight into the potential performance when using these
+DMA devices for acceleration in DPDK applications. It supports memory copy performance tests for
+now, comparing the performance of CPU and DMA automatically in various conditions with the help of a
+pre-set configuration file.
+
+
+Configuration
+-------------
+This application uses inherent DPDK EAL command-line options as well as custom command-line options
+in the application. An example configuration file for the application is provided and gives the
+meanings for each parameter.
+
+Here is an extracted sample from the configuration file (the complete sample can be found in the
+application source directory):
+
+.. code-block:: ini
+
+   [case1]
+   type=DMA_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   dma_ring_size=1024
+   kick_batch=32
+   src_numa_node=0
+   dst_numa_node=0
+   cache_flush=0
+   test_seconds=2
+   lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+   eal_args=--in-memory --file-prefix=test
+
+   [case2]
+   type=CPU_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   src_numa_node=0
+   dst_numa_node=1
+   cache_flush=0
+   test_seconds=2
+   lcore = 3, 4
+   eal_args=--in-memory --no-pci
+
+The configuration file is divided into multiple sections, each section represents a test case.
+The four variables mem_size, buf_size, dma_ring_size, and kick_batch can vary in each test case.
+The format for this is ``variable=first,last,increment,ADD\|MUL``. This means that the first value
+of the variable is 'first', the last value is 'last', 'increment' is the step size, and ADD|MUL
+indicates whether the change is by addition or multiplication. Each case can only have one variable
+change, and each change will generate a scenario, so each case can have multiple scenarios.
+
+Parameter Definitions
+---------------------
+
+- **type**: The type of the test. Currently supported types are `DMA_MEM_COPY` and `CPU_MEM_COPY`.
+- **mem_size**: The size of the memory footprint.
+- **buf_size**: The memory size of a single operation.
+- **dma_ring_size**: The DMA ring buffer size. Must be a power of two, and between 64 and 4096.
+- **kick_batch**: The DMA operation batch size, should be greater than 1 normally.
+- **src_numa_node**: Controls the NUMA node where the source memory is allocated.
+- **dst_numa_node**: Controls the NUMA node where the destination memory is allocated.
+- **cache_flush**: Determines whether the cache should be flushed. `1` indicates to flush and `0` to not flush.
+- **test_seconds**: Controls the test time for each scenario.
+- **lcore_dma**: Specifies the lcore/DMA mapping.
+- **lcore**: Specifies the lcore for CPU testing.
+- **eal_args**: Specifies the EAL arguments.
+
+.. Note::
+
+	The mapping of lcore to DMA must be one-to-one and cannot be duplicated.
+
+To specify a configuration file, use the "\-\-config" flag followed by the path to the file.
+
+To specify a result file, use the "\-\-result" flag followed by the path to the file. If you do not
+specify a result file, one will be generated with the same name as the configuration file, with the
+addition of "_result.csv" at the end.
+
+
+Running the Application
+-----------------------
+
+Typical command-line invocation to execute the application:
+
+.. code-block:: console
+
+   dpdk-test-dma-perf --config ./config_dma.ini --result ./res_dma.csv
+
+Where `config_dma.ini` is the configuration file, and `res_dma.csv` will be the generated result
+file.
+
+After the tests, you can find the results in the `res_dma.csv` file.
+
+Limitations
+-----------
+
+Currently, this tool only supports memory copy performance tests. Additional enhancements are
+possible in the future to support more types of tests for DMA devices and CPUs.
diff --git a/doc/guides/tools/index.rst b/doc/guides/tools/index.rst
index 6f84fc31ff..857572da96 100644
--- a/doc/guides/tools/index.rst
+++ b/doc/guides/tools/index.rst
@@ -23,3 +23,4 @@ DPDK Tools User Guides
     testregex
     testmldev
     dts
+    dmaperf
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [EXT] [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
@ 2023-06-28  4:42   ` Anoob Joseph
  2023-06-28  6:06   ` Ling, WeiX
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 53+ messages in thread
From: Anoob Joseph @ 2023-06-28  4:42 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia,
	Amit Prakash Shukla, huangdengdui, kevin.laatz, fengchengwen,
	Jerin Jacob Kollanukkaran
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he, weix.ling

> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v10:
>   rebased code from 23.07-rc2;
> v9:
>   improved error handling;
>   improved lcore_params structure;
>   improved mbuf api calling;
>   improved exit process;
>   fixed some typos;
>   added scenario summary data display;
>   removed unnecessary include;
> v8:
>   fixed string copy issue in parse_lcore();
>   improved some data display format;
>   added doc in doc/guides/tools;
>   updated release notes;
> v7:
>   fixed some strcpy issues;
>   removed cache setup in calling rte_pktmbuf_pool_create();
>   fixed some typos;
>   added some memory free and null set operations;
>   improved result calculation;
> v6:
>   improved code based on Anoob's comments;
>   fixed some code structure issues;
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build                        |   1 +
>  app/test-dma-perf/benchmark.c          | 508 ++++++++++++++++++++
>  app/test-dma-perf/config.ini           |  61 +++
>  app/test-dma-perf/main.c               | 616 +++++++++++++++++++++++++
>  app/test-dma-perf/main.h               |  64 +++
>  app/test-dma-perf/meson.build          |  17 +
>  doc/guides/rel_notes/release_23_07.rst |   6 +
>  doc/guides/tools/dmaperf.rst           | 103 +++++
>  doc/guides/tools/index.rst             |   1 +
>  9 files changed, 1377 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build  create mode 100644
> doc/guides/tools/dmaperf.rst
> 

Thanks Cheng for addressing all the comments.

Acked-by: Anoob Joseph <anoobj@marvell.com>
Tested-by: Anoob Joseph <anoobj@marvell.com>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
  2023-06-28  4:42   ` [EXT] " Anoob Joseph
@ 2023-06-28  6:06   ` Ling, WeiX
  2023-06-29  9:08   ` Thomas Monjalon
  2023-06-29  9:38   ` Thomas Monjalon
  3 siblings, 0 replies; 53+ messages in thread
From: Ling, WeiX @ 2023-06-28  6:06 UTC (permalink / raw)
  To: Jiang, Cheng1, thomas, Richardson, Bruce, mb, Xia, Chenbo,
	amitprakashs, anoobj, huangdengdui, Laatz, Kevin, fengchengwen,
	jerinj
  Cc: dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Wednesday, June 28, 2023 9:21 AM
> To: thomas@monjalon.net; Richardson, Bruce
> <bruce.richardson@intel.com>; mb@smartsharesystems.com; Xia, Chenbo
> <chenbo.xia@intel.com>; amitprakashs@marvell.com; anoobj@marvell.com;
> huangdengdui@huawei.com; Laatz, Kevin <kevin.laatz@intel.com>;
> fengchengwen@huawei.com; jerinj@marvell.com
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Ling, WeiX <weix.ling@intel.com>; Jiang, Cheng1 <cheng1.jiang@intel.com>
> Subject: [PATCH v10] app/dma-perf: introduce dma-perf application
> 
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v10:
>   rebased code from 23.07-rc2;
> v9:
>   improved error handling;
>   improved lcore_params structure;
>   improved mbuf api calling;
>   improved exit process;
>   fixed some typos;
>   added scenario summary data display;
>   removed unnecessary include;
> v8:
>   fixed string copy issue in parse_lcore();
>   improved some data display format;
>   added doc in doc/guides/tools;
>   updated release notes;
> v7:
>   fixed some strcpy issues;
>   removed cache setup in calling rte_pktmbuf_pool_create();
>   fixed some typos;
>   added some memory free and null set operations;
>   improved result calculation;
> v6:
>   improved code based on Anoob's comments;
>   fixed some code structure issues;
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build                        |   1 +
>  app/test-dma-perf/benchmark.c          | 508 ++++++++++++++++++++
>  app/test-dma-perf/config.ini           |  61 +++
>  app/test-dma-perf/main.c               | 616 +++++++++++++++++++++++++
>  app/test-dma-perf/main.h               |  64 +++
>  app/test-dma-perf/meson.build          |  17 +
>  doc/guides/rel_notes/release_23_07.rst |   6 +
>  doc/guides/tools/dmaperf.rst           | 103 +++++
>  doc/guides/tools/index.rst             |   1 +
>  9 files changed, 1377 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build  create mode 100644
> doc/guides/tools/dmaperf.rst
> 

Tested-by: Wei Ling <weix.ling@intel.com>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
  2023-06-28  4:42   ` [EXT] " Anoob Joseph
  2023-06-28  6:06   ` Ling, WeiX
@ 2023-06-29  9:08   ` Thomas Monjalon
  2023-06-29 12:50     ` Jiang, Cheng1
  2023-06-29  9:38   ` Thomas Monjalon
  3 siblings, 1 reply; 53+ messages in thread
From: Thomas Monjalon @ 2023-06-29  9:08 UTC (permalink / raw)
  To: jiayu.hu, yuanx.wang, Cheng Jiang
  Cc: bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, fengchengwen, jerinj, dev, xuan.ding,
	wenwux.ma, xingguang.he, weix.ling

28/06/2023 03:20, Cheng Jiang:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>

Who is going to be the maintainer for this new app?
An entry in the file MAINTAINERS would be perfect.




^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
                     ` (2 preceding siblings ...)
  2023-06-29  9:08   ` Thomas Monjalon
@ 2023-06-29  9:38   ` Thomas Monjalon
  2023-06-29 12:51     ` Jiang, Cheng1
  3 siblings, 1 reply; 53+ messages in thread
From: Thomas Monjalon @ 2023-06-29  9:38 UTC (permalink / raw)
  To: Cheng Jiang
  Cc: bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, fengchengwen, jerinj, dev, jiayu.hu,
	xuan.ding, wenwux.ma, yuanx.wang, xingguang.he, weix.ling

28/06/2023 03:20, Cheng Jiang:
> --- a/doc/guides/tools/index.rst
> +++ b/doc/guides/tools/index.rst
> @@ -23,3 +23,4 @@ DPDK Tools User Guides
>      testregex
>      testmldev
>      dts
> +    dmaperf
 
Would be better to have upper in the list (DTS should be last).
I suggest before flow-perf as DMA is more low-level.



^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-29  9:08   ` Thomas Monjalon
@ 2023-06-29 12:50     ` Jiang, Cheng1
  2023-06-29 13:19       ` Thomas Monjalon
  0 siblings, 1 reply; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-29 12:50 UTC (permalink / raw)
  To: Thomas Monjalon, Hu, Jiayu, Wang, YuanX
  Cc: Richardson, Bruce, mb, Xia, Chenbo, amitprakashs, anoobj,
	huangdengdui, Laatz, Kevin, fengchengwen, jerinj, dev, Ding,
	Xuan, Ma, WenwuX, He, Xingguang, Ling, WeiX

Hi Thomas,

> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Thursday, June 29, 2023 5:09 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; Wang, YuanX <yuanx.wang@intel.com>;
> Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> amitprakashs@marvell.com; anoobj@marvell.com;
> huangdengdui@huawei.com; Laatz, Kevin <kevin.laatz@intel.com>;
> fengchengwen@huawei.com; jerinj@marvell.com; dev@dpdk.org; Ding,
> Xuan <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; He,
> Xingguang <xingguang.he@intel.com>; Ling, WeiX <weix.ling@intel.com>
> Subject: Re: [PATCH v10] app/dma-perf: introduce dma-perf application
> 
> 28/06/2023 03:20, Cheng Jiang:
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre-set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> 
> Who is going to be the maintainer for this new app?
> An entry in the file MAINTAINERS would be perfect.
> 

I can be the maintainer for this new app. Okay, I will add an entry to the MAINTAINERS file.
Would you prefer I send a new version patch to add this entry, or send a separate patch?

Thanks,
Cheng

> 


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-29  9:38   ` Thomas Monjalon
@ 2023-06-29 12:51     ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-29 12:51 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Richardson, Bruce, mb, Xia, Chenbo, amitprakashs, anoobj,
	huangdengdui, Laatz, Kevin, fengchengwen, jerinj, dev, Hu, Jiayu,
	Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang, Ling, WeiX

Hi Thomas,

> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Thursday, June 29, 2023 5:38 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> amitprakashs@marvell.com; anoobj@marvell.com;
> huangdengdui@huawei.com; Laatz, Kevin <kevin.laatz@intel.com>;
> fengchengwen@huawei.com; jerinj@marvell.com; dev@dpdk.org; Hu, Jiayu
> <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; He,
> Xingguang <xingguang.he@intel.com>; Ling, WeiX <weix.ling@intel.com>
> Subject: Re: [PATCH v10] app/dma-perf: introduce dma-perf application
> 
> 28/06/2023 03:20, Cheng Jiang:
> > --- a/doc/guides/tools/index.rst
> > +++ b/doc/guides/tools/index.rst
> > @@ -23,3 +23,4 @@ DPDK Tools User Guides
> >      testregex
> >      testmldev
> >      dts
> > +    dmaperf
> 
> Would be better to have upper in the list (DTS should be last).
> I suggest before flow-perf as DMA is more low-level.
> 
Sure, I'll submit a new patch to fix it.

Thanks,
Cheng

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH v11] app/dma-perf: introduce dma-perf application
  2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
                   ` (8 preceding siblings ...)
  2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
@ 2023-06-29 13:14 ` Cheng Jiang
  2023-07-03  8:20   ` fengchengwen
  9 siblings, 1 reply; 53+ messages in thread
From: Cheng Jiang @ 2023-06-29 13:14 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, fengchengwen, jerinj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	weix.ling, Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory Copy performance tests are supported for now.
This patch also updates the documentation and maintainer list for the
application.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Tested-by: Anoob Joseph <anoobj@marvell.com>
Tested-by: Wei Ling <weix.ling@intel.com>
---
v11:
  updated MAINTAINERS file;
  fixed typo in doc/guides/tools/dmaperf.rst;
  changed the dmaperf order in doc/guides/tools/index.rst;
v10:
  rebased code from 23.07-rc2;
v9:
  improved error handling;
  improved lcore_params structure;
  improved mbuf api calling;
  improved exit process;
  fixed some typos;
  added scenario summary data display;
  removed unnecessary include;
v8:
  fixed string copy issue in parse_lcore();
  improved some data display format;
  added doc in doc/guides/tools;
  updated release notes;
v7:
  fixed some strcpy issues;
  removed cache setup in calling rte_pktmbuf_pool_create();
  fixed some typos;
  added some memory free and null set operations;
  improved result calculation;
v6:
  improved code based on Anoob's comments;
  fixed some code structure issues;
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 MAINTAINERS                            |   5 +
 app/meson.build                        |   1 +
 app/test-dma-perf/benchmark.c          | 508 ++++++++++++++++++++
 app/test-dma-perf/config.ini           |  61 +++
 app/test-dma-perf/main.c               | 616 +++++++++++++++++++++++++
 app/test-dma-perf/main.h               |  64 +++
 app/test-dma-perf/meson.build          |  17 +
 doc/guides/rel_notes/release_23_07.rst |   6 +
 doc/guides/tools/dmaperf.rst           | 103 +++++
 doc/guides/tools/index.rst             |   1 +
 10 files changed, 1382 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build
 create mode 100644 doc/guides/tools/dmaperf.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index 545ed6dd7d..b99d318608 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1767,6 +1767,11 @@ T: git://dpdk.org/next/dpdk-next-net
 F: app/test-pmd/
 F: doc/guides/testpmd_app_ug/

+DMA device performance tool
+M: Cheng Jiang <cheng1.jiang@intel.com>
+F: app/test-dma-perf/
+F: doc/guides/tools/dmaperf.rst
+
 Flow performance tool
 M: Wisam Jaddo <wisamm@nvidia.com>
 F: app/test-flow-perf/
diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..0601e0d171
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,508 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+#define POLL_MAX 1000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+#define CSV_TOTAL_LINE_FMT "Scenario %u Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"
+
+/*
+ * Synchronisation flags and completion counters for one worker. The benchmark
+ * code reaches this structure through volatile-qualified references; it is
+ * written both by the worker and by mem_copy_benchmark().
+ */
+struct worker_info {
+	bool ready_flag;	/* set by the worker just before it waits for start_flag */
+	bool start_flag;	/* set by mem_copy_benchmark() to release the worker */
+	bool stop_flag;		/* cleared by the worker on entry, set by mem_copy_benchmark() to end the run */
+	uint32_t total_cpl;	/* running count of completed copies, updated by the worker */
+	uint32_t test_cpl;	/* total_cpl snapshot, then the delta over the measured interval */
+};
+
+/*
+ * Arguments handed to one worker (do_dma_mem_copy() or do_cpu_mem_copy())
+ * through rte_eal_remote_launch(), filled in by mem_copy_benchmark().
+ */
+struct lcore_params {
+	uint8_t scenario_id;	/* copied from the test configuration */
+	unsigned int lcore_id;	/* lcore the worker is launched on */
+	char *dma_name;		/* only assigned for DMA tests */
+	uint16_t worker_id;	/* index of this worker in lcores[] */
+	uint16_t dev_id;	/* only assigned for DMA tests */
+	uint32_t nr_buf;	/* number of buffers owned by this worker */
+	uint16_t kick_batch;	/* only assigned for DMA tests */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint16_t test_secs;	/* copied from the test configuration */
+	struct rte_mbuf **srcs;	/* this worker's slice of the source mbuf array */
+	struct rte_mbuf **dsts;	/* this worker's slice of the destination mbuf array */
+	volatile struct worker_info worker_info;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static struct lcore_params *lcores[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write "In <func>:<lineno> - " followed by the printf-style message to
+ * stderr. Returns the sum of the fprintf() and vfprintf() return values.
+ * Normally invoked through the PRINT_ERR() macro.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int prefix_len;
+	int message_len;
+
+	prefix_len = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	message_len = vfprintf(stderr, format, args);
+	va_end(args);
+
+	return prefix_len + message_len;
+}
+
+/*
+ * Derive the statistics reported for one worker.
+ *
+ * buf_size   - bytes copied per operation
+ * nr_buf     - total number of buffers, split evenly across the workers
+ * nb_workers - number of workers, must be non-zero
+ * test_secs  - length of the measured interval in seconds
+ * total_cnt  - copies completed by this worker during the interval
+ * memory     - out: per-worker footprint (sources plus destinations), bytes / 2^20
+ * ave_cycle  - out: timer cycles per copy, 0 when nothing completed
+ * bandwidth  - out: bits copied per second / 10^9
+ * mops       - out: copies per second / 10^6
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	const uint32_t bufs_per_worker = nr_buf / nb_workers;
+	float ops;
+
+	/* Widen before multiplying: the byte count does not fit in 32 bits from 4 GiB up. */
+	*memory = (float)((uint64_t)buf_size * bufs_per_worker * 2) / (1024 * 1024);
+	/* A worker that completed nothing must not trigger an integer division by zero. */
+	*ave_cycle = total_cnt != 0 ? test_secs * rte_get_timer_hz() / total_cnt : 0;
+	ops = (float)total_cnt / test_secs;
+	*mops = ops / (1000 * 1000);
+	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
+}
+
+/*
+ * Print one worker's results to stdout and format the matching CSV row into
+ * output_str[lcore_id].
+ *
+ * dma_name, ring_size and kick_batch are only used when is_dma is true; the
+ * CPU row format writes "NA" in those columns instead.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
+			float memory, float bandwidth, float mops, bool is_dma)
+{
+	char *csv_row = output_str[lcore_id];
+
+	if (!is_dma)
+		printf("lcore %u\n", lcore_id);
+	else
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
+				lcore_id, dma_name, ring_size, kick_batch);
+
+	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
+	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (!is_dma) {
+		snprintf(csv_row, MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+		return;
+	}
+
+	snprintf(csv_row, MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+		scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
+		nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/*
+ * Flush the first buf_size bytes of every mbuf data area in array[0..nr_buf)
+ * with clflush, stepping 64 bytes at a time (presumably the x86 cache-line
+ * size). Compiles to an empty function unless RTE_ARCH_X86_64 is defined.
+ */
+static inline void
+cache_flush_buf(__rte_unused struct rte_mbuf **array,
+		__rte_unused uint32_t buf_size,
+		__rte_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	uint32_t buf_idx;
+
+	for (buf_idx = 0; buf_idx < nr_buf; buf_idx++) {
+		char *base = rte_pktmbuf_mtod(array[buf_idx], char *);
+		uint32_t off = 0;
+
+		while (off < buf_size) {
+			__builtin_ia32_clflush(base + off);
+			off += 64;
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a DMA device with a single mem-to-mem virtual channel of
+ * ring_size descriptors and start it. Any failure ends the application
+ * through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	const uint16_t vchan_id = 0;
+	struct rte_dma_conf dma_conf = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf vchan_conf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+	struct rte_dma_info dma_info;
+	int ret;
+
+	ret = rte_dma_configure(dev_id, &dma_conf);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	ret = rte_dma_vchan_setup(dev_id, vchan_id, &vchan_conf);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	ret = rte_dma_info_get(dev_id, &dma_info);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
+
+	/* The device must report exactly the one vchan configured above. */
+	if (dma_info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	ret = rte_dma_start(dev_id);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+/*
+ * Resolve every DMA name in the lcore/DMA map to a device id, record it in
+ * the map and configure/start the device. Stops at the first name that
+ * cannot be resolved.
+ *
+ * Returns 0 when one device was set up per worker, -1 otherwise.
+ */
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	struct lcore_dma_map_t *map = &cfg->lcore_dma_map;
+	const uint32_t ring_size = cfg->ring_size.cur;
+	const uint32_t nb_workers = map->cnt;
+	uint16_t nb_dmadevs = 0;
+	uint32_t idx;
+
+	for (idx = 0; idx < map->cnt; idx++) {
+		char *name = map->dma_names[idx];
+		int id = rte_dma_get_dev_id_by_name(name);
+
+		if (id < 0) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", name);
+			break;
+		}
+
+		map->dma_ids[idx] = id;
+		configure_dmadev_queue(id, ring_size);
+		nb_dmadevs++;
+	}
+
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+	return 0;
+}
+
+/* Stop and close the given DMA device, then end the application with EXIT_FAILURE. */
+static void
+error_exit(int dev_id)
+{
+	rte_dma_stop(dev_id);
+	rte_dma_close(dev_id);
+	rte_exit(EXIT_FAILURE, "DMA error\n");
+}
+
+/*
+ * Submit the copies enqueued on vchan 0 of dev_id, then collect up to
+ * MAX_DMA_CPL_NB completions. The number collected is subtracted from
+ * *async_cnt and added to worker_info->total_cpl. A submit failure ends the
+ * application through error_exit().
+ */
+static inline void
+do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
+			volatile struct worker_info *worker_info)
+{
+	uint16_t done;
+
+	if (rte_dma_submit(dev_id, 0) < 0)
+		error_exit(dev_id);
+
+	done = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+	worker_info->total_cpl += done;
+	*async_cnt -= done;
+}
+
+/*
+ * Worker entry point for the DMA copy test; p points to this worker's
+ * struct lcore_params.
+ *
+ * The worker announces itself through ready_flag, spins until start_flag is
+ * set, then repeatedly enqueues one copy per buffer pair on vchan 0 of its
+ * device until stop_flag is observed at the end of a pass over all buffers.
+ * Always returns 0; DMA errors end the application through error_exit().
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint16_t dev_id = para->dev_id;
+	const uint32_t nr_buf = para->nr_buf;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint16_t nr_cpl;
+	uint64_t async_cnt = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+	int ret;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait until mem_copy_benchmark() releases all workers. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
+				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					/* Ring full: submit and reap completions, then retry this buffer. */
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+
+			/* Submit and poll whenever the in-flight count is a multiple of kick_batch. */
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		/* stop_flag is only checked after a complete pass over the buffers. */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/*
+	 * Submit what is still enqueued and poll at most POLL_MAX times for the
+	 * outstanding copies; these completions are not added to total_cpl.
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/*
+ * Worker entry point for the CPU copy test; p points to this worker's
+ * struct lcore_params.
+ *
+ * The worker announces itself through ready_flag, spins until start_flag is
+ * set, then copies buf_size bytes from every source mbuf to the matching
+ * destination mbuf with rte_memcpy(), counting each copy in total_cpl, until
+ * stop_flag is observed at the end of a pass over all buffers.
+ * Always returns 0.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	const uint32_t nr_buf = para->nr_buf;
+	const uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait until mem_copy_benchmark() releases all workers. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/*
+			 * Copy buffer from src to dst through the mbuf virtual
+			 * data addresses. The IO address returned by
+			 * rte_mbuf_data_iova() is meant for devices and is not
+			 * a valid CPU pointer when IOVA differs from VA.
+			 */
+			rte_memcpy(rte_pktmbuf_mtod(dsts[i], void *),
+				rte_pktmbuf_mtod(srcs[i], const void *),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		/* stop_flag is only checked after a complete pass over the buffers. */
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools on the configured NUMA nodes
+ * and fill *srcs and *dsts with cfg->nr_buf mbufs each.
+ *
+ * cfg  - test configuration; buf_size.cur, nr_buf, src_numa_node and
+ *        dst_numa_node are read
+ * srcs - out: array of nr_buf source mbuf pointers
+ * dsts - out: array of nr_buf destination mbuf pointers
+ *
+ * Returns 0 on success and -1 on failure. Nothing is released here on
+ * failure: src_pool, dst_pool, *srcs and *dsts keep whatever was created so
+ * far and have to be freed by the caller.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf,
+			0,
+			0,
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/*
+	 * The pointer arrays are zero-filled so that, if a later step fails, the
+	 * caller's bulk free only sees NULL entries (which it may contain)
+	 * instead of uninitialised pointers.
+	 */
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src mbufs failed.\n");
+		return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one memory copy scenario and report the results.
+ *
+ * cfg    - test configuration; cfg->nr_buf is set here from mem_size.cur and
+ *          buf_size.cur (half of the footprint for sources, half for
+ *          destinations)
+ * is_dma - true to copy with the DMA devices of the lcore/DMA map, false to
+ *          copy with the CPU on the mapped lcores
+ *
+ * One worker is launched per map entry. After a warm-up of
+ * TEST_WAIT_U_SECOND microseconds the completion counters are sampled,
+ * sampled again cfg->test_secs seconds later, and the workers are stopped.
+ * Per-worker results go through output_result(); the summary row is written
+ * to output_str[MAX_WORKER_NB]. All memory set up for the run is released
+ * before returning, on success and on failure.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf;
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory = 0;
+	uint32_t avg_cycles = 0;
+	uint32_t avg_cycles_total;
+	float mops, mops_total;
+	float bandwidth, bandwidth_total;
+
+	/* buf_size is a divisor of the buffer count computed below. */
+	if (buf_size == 0) {
+		printf("Error: buffer size must not be zero.\n");
+		return;
+	}
+
+	/* Widen before multiplying: mem_size in bytes may not fit in 32 bits. */
+	nr_buf = cfg->nr_buf = (uint32_t)(((uint64_t)cfg->mem_size.cur * 1024 * 1024) /
+					((uint64_t)buf_size * 2));
+
+	/* nb_workers is a divisor below and indexes lcores[MAX_WORKER_NB]. */
+	if (nb_workers == 0 || nb_workers > MAX_WORKER_NB) {
+		printf("Error: invalid number of workers (%u), expected 1 to %u.\n",
+				nb_workers, (unsigned int)MAX_WORKER_NB);
+		return;
+	}
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush == 1) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/*
+	 * Allocate and fill the parameters of every worker before launching any
+	 * of them, so an allocation failure can simply bail out without a
+	 * running worker left behind. The memory is zeroed so that the
+	 * ready/start/stop flags and counters start from a known state.
+	 */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+		lcores[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (lcores[i] == NULL) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			goto out;
+		}
+		if (is_dma) {
+			lcores[i]->dma_name = ldm->dma_names[i];
+			lcores[i]->dev_id = ldm->dma_ids[i];
+			lcores[i]->kick_batch = kick_batch;
+		}
+		lcores[i]->worker_id = i;
+		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		lcores[i]->buf_size = buf_size;
+		lcores[i]->test_secs = test_secs;
+		lcores[i]->srcs = srcs + offset;
+		lcores[i]->dsts = dsts + offset;
+		lcores[i]->scenario_id = cfg->scenario_id;
+		lcores[i]->lcore_id = lcore_id;
+	}
+
+	for (i = 0; i < nb_workers; i++) {
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]),
+					lcores[i]->lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]),
+					lcores[i]->lcore_id);
+	}
+
+	/* Spin until every worker has raised its ready flag. */
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (lcores[i]->worker_info.ready_flag == false) {
+				ready = false;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.start_flag = true;
+
+	/* Warm-up: the completions counted so far are excluded from the result. */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl;
+
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl -
+						lcores[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		lcores[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	mops_total = 0;
+	bandwidth_total = 0;
+	avg_cycles_total = 0;
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			lcores[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, lcores[i]->lcore_id,
+					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
+					avg_cycles, buf_size, nr_buf / nb_workers, memory,
+					bandwidth, mops, is_dma);
+		mops_total += mops;
+		bandwidth_total += bandwidth;
+		avg_cycles_total += avg_cycles;
+	}
+	printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n", bandwidth_total, mops_total);
+	snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN, CSV_TOTAL_LINE_FMT,
+			cfg->scenario_id, nr_buf, memory * nb_workers,
+			avg_cycles_total / nb_workers, bandwidth_total, mops_total);
+
+out:
+	/* free mbufs used in the test */
+	if (srcs != NULL)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts != NULL)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	/* free the pointer arrays for the mbufs */
+	rte_free(srcs);
+	srcs = NULL;
+	rte_free(dsts);
+	dsts = NULL;
+
+	rte_mempool_free(src_pool);
+	src_pool = NULL;
+
+	rte_mempool_free(dst_pool);
+	dst_pool = NULL;
+
+	/* free the worker parameters; entries never allocated are still NULL */
+	for (i = 0; i < nb_workers; i++) {
+		rte_free(lcores[i]);
+		lcores[i] = NULL;
+	}
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..b550f4b23f
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It must be a power of two, and between
+;  64 and 4096.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first,last,increment,ADD|MUL.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of each scenario.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of values.
+; We have to ensure a 1:1 mapping between the core and DMA device.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..de37120df6
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,DMA ring size,kick batch size,buffer size(B),number of buffers,memory(MB),average cycle,bandwidth(Gbps),MOps\n"
+/*
+ * CSV_HDR_FMT above is the header row queued by output_header(); its two
+ * conversions take the case id and the test type string.
+ *
+ * The two limits below size the argument table main() hands to rte_eal_init():
+ * MAX_EAL_PARAM_NB slots of MAX_EAL_PARAM_LEN bytes each.
+ */
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+/* Values accepted for the "type" key of a case section (see load_configs()). */
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+/* Application options; main() uses the argv element following each one as a path. */
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+/* Most comma separated fields parse_entry() splits a config value into. */
+#define MAX_PARAMS_PER_ENTRY 4
+/* Upper bound on the characters compared when main() matches the options above. */
+#define MAX_LONG_OPT_SZ 64
+/* Test type identifiers; TEST_TYPE_NONE is 0, the value of a zero-initialised case. */
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+/* Parsed test cases, filled in by load_configs() and run one by one from main(). */
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+/* Queued result rows, written and cleared by output_csv(); declared extern in main.h. */
+char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];
+/* Result CSV stream: opened and closed in main(), written through output_csv(). */
+static FILE *fd;
+
+/*
+ * Write the rows queued in output_str[] to the result file.
+ *
+ * When need_blankline is set, two rows of empty cells are emitted first as a
+ * separator. Each non-empty row is then written in index order and cleared,
+ * so a later call does not write it again. The stream is flushed at the end.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t row;
+
+	if (need_blankline) {
+		uint32_t sep;
+
+		for (sep = 0; sep < 2; sep++)
+			fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		char *line = output_str[row];
+
+		if (line[0] == '\0')
+			continue;
+
+		fprintf(fd, "%s", line);
+		line[0] = '\0';
+	}
+
+	fflush(fd);
+}
+
+/*
+ * Queue the "Test Environment" rows (currently only the timer frequency,
+ * expressed in GHz) and write them out after a blank separator.
+ */
+static void
+output_env_info(void)
+{
+	const double freq_ghz = rte_get_timer_hz() / 1000000000.0;
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Test Environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%.3lf Ghz", freq_ghz);
+
+	output_csv(true);
+}
+
+/*
+ * Queue the CSV header row of a case (case id and test type name followed by
+ * the column titles) and write it out after a blank separator.
+ */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	char *hdr_row = output_str[0];
+	const char *type_name = case_cfg->test_type_str;
+
+	snprintf(hdr_row, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT, case_id, type_name);
+	output_csv(true);
+}
+
+/*
+ * Run the benchmark matching the test type of the case: both supported types
+ * go through mem_copy_benchmark(), which is told whether to copy with DMA or
+ * with the CPU. Any other type is only reported.
+ */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const int type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, true);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		mem_copy_benchmark(case_cfg, false);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case.
+ *
+ * At most one of mem_size, buf_size, ring_size and kick_batch is swept (the
+ * one with a non-zero increment); each value of the sweep is one scenario.
+ * A case without swept parameter runs exactly one scenario.
+ *
+ * @param case_id
+ *   1-based case number, used in messages and in the CSV header.
+ * @param case_cfg
+ *   Parsed configuration of the case; the "cur" field of the swept entry and
+ *   scenario_id are updated while the scenarios run.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	/* all-zero entry (first == last == 0, OP_NONE): yields a single scenario */
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* one lcore is needed on top of the configured workers */
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		uint64_t next;
+
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		/* nothing is swept: the single scenario is done */
+		if (var_entry->op == OP_NONE)
+			break;
+
+		/* compute the next value in 64 bits so a wrap-around cannot restart the sweep */
+		if (var_entry->op == OP_ADD)
+			next = (uint64_t)var_entry->cur + var_entry->incr;
+		else if (var_entry->op == OP_MUL)
+			next = (uint64_t)var_entry->cur * var_entry->incr;
+		else {
+			printf("No proper operation for variable entry.\n");
+			break;
+		}
+
+		/* e.g. MUL by 1 or starting from 0: the sweep would never end */
+		if (next <= var_entry->cur) {
+			printf("Variable entry does not advance, stopping case %u.\n", case_id);
+			break;
+		}
+
+		if (next > var_entry->last)
+			break;
+
+		var_entry->cur = (uint32_t)next;
+	}
+}
+
+/*
+ * Parse the "lcore" entry of a CPU test case: a list of decimal lcore ids
+ * separated by commas and/or blanks, e.g. "3, 4".
+ *
+ * The lcore/DMA map of the case is reset, then filled with the ids in the
+ * order they are listed.
+ *
+ * @return
+ *   0 on success, -1 on NULL arguments, allocation failure, a token that is
+ *   not a valid id, or more ids than the map can hold.
+ */
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *token, *saveptr = NULL;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* work on a private copy, tokenizing modifies the string */
+	input = strdup(value);
+	if (input == NULL)
+		return -1;
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	for (token = strtok_r(input, ", ", &saveptr); token != NULL;
+			token = strtok_r(NULL, ", ", &saveptr)) {
+		unsigned long lcore_id;
+		char *end;
+
+		/* bound by the real array size, not by MAX_LCORE_NB which is larger */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "Too many lcores, at most %u are supported\n",
+					(unsigned int)RTE_DIM(lcore_dma_map->lcores));
+			ret = -1;
+			break;
+		}
+
+		lcore_id = strtoul(token, &end, 10);
+		if (end == token || *end != '\0' || lcore_id > UINT16_MAX) {
+			fprintf(stderr, "Invalid lcore ID '%s'\n", token);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = (uint32_t)lcore_id;
+	}
+
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse the "lcore_dma" entry of a DMA test case: a comma separated list of
+ * "lcore<ID>@<DMA device name>" pairs, e.g.
+ * "lcore10@0000:00:04.2, lcore11@0000:00:04.3".
+ *
+ * The lcore/DMA map of the case is reset, then filled with one lcore id and
+ * one device name per pair, in the order they are listed.
+ *
+ * @return
+ *   0 on success, -1 on NULL arguments, allocation failure, empty input, a
+ *   malformed pair, or more pairs than the map can hold.
+ */
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input, *addrs;
+	char *ptrs[2];
+	char *start, *end, *substr, *saveptr = NULL;
+	long lcore_id;
+	int ret = 0;
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	/* work on a private copy, splitting modifies the string */
+	input = strdup(value);
+	if (input == NULL) {
+		fprintf(stderr, "Failed to copy the lcore_dma string\n");
+		return -1;
+	}
+	addrs = input;
+
+	/* skip leading blanks; stops at the terminator of an empty string */
+	while (*addrs == ' ' || *addrs == '\t')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok_r(addrs, ",", &saveptr);
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	lcore_dma_map = &test_case->lcore_dma_map;
+	memset(lcore_dma_map, 0, sizeof(*lcore_dma_map));
+
+	do {
+		/* both halves are required, otherwise ptrs[1] is not set */
+		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') != 2) {
+			fprintf(stderr, "Illegal DMA address\n");
+			ret = -1;
+			break;
+		}
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start || lcore_id < 0 || lcore_id > UINT16_MAX) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", (int)lcore_id);
+			ret = -1;
+			break;
+		}
+
+		/* bound by the real array size, not by MAX_LCORE_NB which is larger */
+		if (lcore_dma_map->cnt >= RTE_DIM(lcore_dma_map->lcores)) {
+			fprintf(stderr, "lcores count error\n");
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = (uint32_t)lcore_id;
+		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
+				RTE_DEV_NAME_MAX_LEN);
+		lcore_dma_map->cnt++;
+		substr = strtok_r(NULL, ",", &saveptr);
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+/*
+ * Parse one config value into a test_configure_entry.
+ *
+ * Two forms are accepted: a single number ("1024"), or a sweep written as
+ * "first,last,increment,ADD|MUL". Input longer than 254 characters is
+ * truncated before parsing.
+ *
+ * @return
+ *   1 for a single value, 4 for a sweep, -1 on NULL or empty input, on any
+ *   other number of fields, or on an unknown operation name.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char buf[255] = {0};
+	char *fields[MAX_PARAMS_PER_ENTRY];
+	int nb_fields;
+
+	if (value == NULL || entry == NULL)
+		return -1;
+
+	strlcpy(buf, value, sizeof(buf));
+	if (buf[0] == '\0')
+		return -1;
+
+	nb_fields = rte_strsplit(buf, strlen(buf), fields, MAX_PARAMS_PER_ENTRY, ',');
+	if (nb_fields != 1 && nb_fields != 4)
+		return -1;
+
+	entry->first = (uint32_t)atoi(fields[0]);
+	entry->cur = entry->first;
+
+	/* single value: nothing to sweep */
+	if (nb_fields == 1) {
+		entry->op = OP_NONE;
+		entry->last = 0;
+		entry->incr = 0;
+		return 1;
+	}
+
+	entry->last = (uint32_t)atoi(fields[1]);
+	entry->incr = (uint32_t)atoi(fields[2]);
+
+	if (strcmp(fields[3], "MUL") == 0) {
+		entry->op = OP_MUL;
+	} else if (strcmp(fields[3], "ADD") == 0) {
+		entry->op = OP_ADD;
+	} else {
+		printf("Invalid op %s.\n", fields[3]);
+		return -1;
+	}
+
+	return 4;
+}
+
+/*
+ * Read a mandatory integer entry of a section.
+ * Returns 0 and stores the value in *out, or -1 when the entry is missing or
+ * does not start with a number.
+ */
+static int
+get_cfg_int(struct rte_cfgfile *cfgfile, const char *section, const char *key, int *out)
+{
+	const char *str = rte_cfgfile_get_entry(cfgfile, section, key);
+	char *end;
+	long val;
+
+	if (str == NULL) {
+		printf("Error: No %s in %s.\n", key, section);
+		return -1;
+	}
+
+	val = strtol(str, &end, 10);
+	if (end == str) {
+		printf("Error: Invalid %s '%s' in %s.\n", key, str, section);
+		return -1;
+	}
+
+	*out = (int)val;
+	return 0;
+}
+
+/*
+ * Load the test cases from the configuration file into test_cases[].
+ *
+ * Sections are expected to be named "case1", "case2", ... A section that is
+ * missing a mandatory entry or fails to parse is marked invalid and skipped;
+ * the remaining sections are still loaded. The process exits if the file
+ * cannot be opened or holds more than MAX_TEST_CASES sections.
+ *
+ * @param path
+ *   Path of the configuration file.
+ * @return
+ *   Number of sections found, valid or not.
+ */
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *eal_args;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp, int_val;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		rte_cfgfile_close(cfgfile);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (case_type == NULL) {
+			printf("Error: No case type in case %d, the test will be finished here.\n",
+				i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Wrong test case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		/* a missing entry must not reach the number parser as NULL */
+		if (get_cfg_int(cfgfile, section_name, "src_numa_node", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->src_numa_node = int_val;
+
+		if (get_cfg_int(cfgfile, section_name, "dst_numa_node", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->dst_numa_node = int_val;
+
+		/* nb_vp counts the swept ("first,last,incr,op") parameters of the case */
+		nb_vp = 0;
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr == 4)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr == 4)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr == 4)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr == 4)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Case %d error, each section can only have a single variable parameter.\n",
+					i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (get_cfg_int(cfgfile, section_name, "cache_flush", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->cache_flush = (uint8_t)int_val;
+
+		if (get_cfg_int(cfgfile, section_name, "test_seconds", &int_val) < 0) {
+			test_case->is_valid = false;
+			continue;
+		}
+		test_case->test_secs = (uint16_t)int_val;
+
+		/*
+		 * The entry returned by the cfgfile library lives inside cfgfile,
+		 * which is closed below while eal_args is used later by main().
+		 * Keep a private copy; it is intentionally never freed, as it has
+		 * to stay valid until the process exits.
+		 */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: Cannot store eal_args of case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			test_case->eal_args = NULL;
+		}
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/*
+ * Build the argument vector passed to rte_eal_init() for one test case.
+ *
+ * The command line is copied without the application's own options
+ * (--config / --result and the value following each of them), then the
+ * blank separated tokens of eal_args are appended. Empty tokens produced by
+ * consecutive blanks are dropped.
+ *
+ * @param new_argv
+ *   Table of MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes each, as set
+ *   up by main(). Arguments beyond that capacity are dropped with a warning.
+ * @return
+ *   Number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			/* skip the option and its value */
+			i++;
+			continue;
+		}
+		if (new_argc >= MAX_EAL_PARAM_NB)
+			goto overflow;
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, MAX_EAL_PARAM_LEN);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			/* "a  b" splits into "a", "" and "b": ignore the empty one */
+			if (tokens[i][0] == '\0')
+				continue;
+			if (new_argc >= MAX_EAL_PARAM_NB)
+				goto overflow;
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+		}
+	}
+
+	return new_argc;
+
+overflow:
+	printf("Too many EAL arguments, only the first %d are used.\n", MAX_EAL_PARAM_NB);
+	return new_argc;
+}
+
+/*
+ * Entry point: load the test cases from the file given with --config, then
+ * run each valid case in its own child process, appending the results to the
+ * CSV file given with --result. Without --result, the result path is derived
+ * from the config path by cutting the file name at its first '.' and adding
+ * "_result.csv".
+ *
+ * Each case gets a fresh process because the EAL is initialised with the
+ * arguments of that case.
+ *
+ * @return
+ *   0 when all cases were processed, -1 on bad command line or when the
+ *   result file cannot be created.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	/* argv[argc] is NULL, so an option given last leaves its path unset */
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		/* cut the file name at its first '.' to drop the extension */
+		char *token = strtok(basename(rst_path), ".");
+		if (token == NULL) {
+			printf("Config file error.\n");
+			return -1;
+		}
+		/* sizeof() of the suffix counts its terminator */
+		if (strlen(rst_path) + sizeof("_result.csv") > sizeof(rst_path)) {
+			printf("Result file path is too long.\n");
+			return -1;
+		}
+		strcat(rst_path, "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	/* create or truncate the result file; it is reopened in append mode later */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	printf("Running cases...\n");
+	for (i = 0; i < case_nb; i++) {
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No valid test type in test case %d.\n\n", i + 1);
+			snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Invalid case %d\n", i + 1);
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+			output_csv(true);
+			fclose(fd);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			/* child: initialise the EAL for this case, run it and exit */
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+
+			output_env_info();
+
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/* parent: wait for the case to finish and report how it ended */
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..12bc3f4e3f
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+
+#define MAX_WORKER_NB 128 /* capacity of the per-worker arrays of struct lcore_dma_map_t */
+#define MAX_OUTPUT_STR_LEN 512 /* size in bytes of one output_str[] row */
+/* NOTE(review): MAX_LCORE_NB is larger than MAX_WORKER_NB, which sizes lcore_dma_map_t.lcores[]. */
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+/* Result rows defined in main.c: one per worker plus a last row for the totals line. */
+extern char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];
+/* How a swept parameter moves from one scenario to the next (see run_test() in main.c). */
+typedef enum {
+	OP_NONE = 0, /* single value, nothing is swept */
+	OP_ADD, /* the increment is added to the current value */
+	OP_MUL /* the current value is multiplied by the increment */
+} alg_op_type;
+/* One config value: either a single number or a "first,last,increment,ADD|MUL" sweep. */
+struct test_configure_entry {
+	uint32_t first; /* starting value; the only value when nothing is swept */
+	uint32_t last; /* inclusive end of the sweep; 0 for a single value */
+	uint32_t incr; /* step (ADD) or factor (MUL); 0 for a single value */
+	alg_op_type op; /* how cur advances between scenarios */
+	uint32_t cur; /* value used by the scenario being run */
+};
+/* Worker lcores of a case and, for DMA cases, the DMA device paired with each of them. */
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB]; /* worker lcore ids, in the order given in the config */
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN]; /* text after '@' in "lcore_dma" */
+	int16_t dma_ids[MAX_WORKER_NB]; /* dmadev ids, passed to rte_dma_stop() in benchmark.c */
+	uint16_t cnt; /* number of used entries in the arrays above */
+};
+/* Configuration of one test case, filled from a [caseN] section by load_configs() in main.c. */
+struct test_configure {
+	bool is_valid; /* false when the section failed to parse; main() skips such cases */
+	uint8_t test_type; /* one of the TEST_TYPE_* values of main.c */
+	const char *test_type_str; /* test type name, printed in the CSV header */
+	uint16_t src_numa_node; /* "src_numa_node" config entry */
+	uint16_t dst_numa_node; /* "dst_numa_node" config entry */
+	uint16_t opcode; /* NOTE(review): not referenced in main.c */
+	bool is_dma; /* NOTE(review): not set in main.c, which passes is_dma as an argument instead */
+	struct lcore_dma_map_t lcore_dma_map; /* from the "lcore_dma" or "lcore" entry */
+	struct test_configure_entry mem_size; /* "mem_size" entry */
+	struct test_configure_entry buf_size; /* "buf_size" entry */
+	struct test_configure_entry ring_size; /* "dma_ring_size" entry, DMA cases only */
+	struct test_configure_entry kick_batch; /* "kick_batch" entry, DMA cases only */
+	uint8_t cache_flush; /* "cache_flush" entry */
+	uint32_t nr_buf; /* NOTE(review): not referenced in main.c */
+	uint16_t test_secs; /* "test_seconds" entry */
+	const char *eal_args; /* "eal_args" entry, appended to the EAL command line; may be NULL */
+	uint8_t scenario_id; /* 1-based scenario counter, incremented by run_test() */
+};
+/* Run one memory copy scenario of cfg; is_dma is true for DMA_MEM_COPY, false for CPU_MEM_COPY. */
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+# DPDK libraries this application depends on.
+deps += ['dmadev', 'mbuf', 'cfgfile']
+# Source files of the application.
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
diff --git a/doc/guides/rel_notes/release_23_07.rst b/doc/guides/rel_notes/release_23_07.rst
index 4459144140..796cc5517d 100644
--- a/doc/guides/rel_notes/release_23_07.rst
+++ b/doc/guides/rel_notes/release_23_07.rst
@@ -200,6 +200,12 @@ New Features

   Enhanced the GRO library to support TCP packets over IPv6 network.

+* **Added DMA device performance test application.**
+
+  Added a new application to test the performance of DMA device and CPU.
+
+  See the :doc:`../tools/dmaperf` for more details.
+

 Removed Items
 -------------
diff --git a/doc/guides/tools/dmaperf.rst b/doc/guides/tools/dmaperf.rst
new file mode 100644
index 0000000000..e892a0b508
--- /dev/null
+++ b/doc/guides/tools/dmaperf.rst
@@ -0,0 +1,103 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2023 Intel Corporation.
+
+dpdk-test-dma-perf Application
+==============================
+
+The ``dpdk-test-dma-perf`` tool is a Data Plane Development Kit (DPDK) application that enables
+testing the performance of DMA (Direct Memory Access) devices available within DPDK. It provides a
+test framework to assess the performance of CPU and DMA devices under various scenarios, such as
+varying buffer lengths. Doing so provides insight into the potential performance when using these
+DMA devices for acceleration in DPDK applications. It supports memory copy performance tests for
+now, comparing the performance of CPU and DMA automatically in various conditions with the help of a
+pre-set configuration file.
+
+
+Configuration
+-------------
+This application uses inherent DPDK EAL command-line options as well as custom command-line options
+in the application. An example configuration file for the application is provided and gives the
+meanings for each parameter.
+
+Here is an extracted sample from the configuration file (the complete sample can be found in the
+application source directory):
+
+.. code-block:: ini
+
+   [case1]
+   type=DMA_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   dma_ring_size=1024
+   kick_batch=32
+   src_numa_node=0
+   dst_numa_node=0
+   cache_flush=0
+   test_seconds=2
+   lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+   eal_args=--in-memory --file-prefix=test
+
+   [case2]
+   type=CPU_MEM_COPY
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   src_numa_node=0
+   dst_numa_node=1
+   cache_flush=0
+   test_seconds=2
+   lcore = 3, 4
+   eal_args=--in-memory --no-pci
+
+The configuration file is divided into multiple sections, each section represents a test case.
+The four variables mem_size, buf_size, dma_ring_size, and kick_batch can vary in each test case.
+The format for this is ``variable=first,last,increment,ADD|MUL``. This means that the first value
+of the variable is 'first', the last value is 'last', 'increment' is the step size, and ADD|MUL
+indicates whether the change is by addition or multiplication. Each case can only have one variable
+change, and each change will generate a scenario, so each case can have multiple scenarios.
+
+Parameter Definitions
+---------------------
+
+- **type**: The type of the test. Currently supported types are `DMA_MEM_COPY` and `CPU_MEM_COPY`.
+- **mem_size**: The size of the memory footprint.
+- **buf_size**: The memory size of a single operation.
+- **dma_ring_size**: The DMA ring buffer size. Must be a power of two, and between 64 and 4096.
+- **kick_batch**: The DMA operation batch size, should be greater than 1 normally.
+- **src_numa_node**: Controls the NUMA node where the source memory is allocated.
+- **dst_numa_node**: Controls the NUMA node where the destination memory is allocated.
+- **cache_flush**: Determines whether the cache should be flushed. `1` indicates to flush and `0` to not flush.
+- **test_seconds**: Controls the test time for each scenario.
+- **lcore_dma**: Specifies the lcore/DMA mapping.
+- **lcore**: Specifies the lcore for CPU testing.
+- **eal_args**: Specifies the EAL arguments.
+
+.. Note::
+
+	The mapping of lcore to DMA must be one-to-one and cannot be duplicated.
+
+To specify a configuration file, use the "\-\-config" flag followed by the path to the file.
+
+To specify a result file, use the "\-\-result" flag followed by the path to the file. If you do not
+specify a result file, one will be generated with the same name as the configuration file, with the
+addition of "_result.csv" at the end.
+
+
+Running the Application
+-----------------------
+
+Typical command-line invocation to execute the application:
+
+.. code-block:: console
+
+   dpdk-test-dma-perf --config ./config_dma.ini --result ./res_dma.csv
+
+Where `config_dma.ini` is the configuration file, and `res_dma.csv` will be the generated result
+file.
+
+After the tests, you can find the results in the `res_dma.csv` file.
+
+Limitations
+-----------
+
+Currently, this tool only supports memory copy performance tests. Additional enhancements are
+possible in the future to support more types of tests for DMA devices and CPUs.
diff --git a/doc/guides/tools/index.rst b/doc/guides/tools/index.rst
index 6f84fc31ff..f2afb1fcc5 100644
--- a/doc/guides/tools/index.rst
+++ b/doc/guides/tools/index.rst
@@ -14,6 +14,7 @@ DPDK Tools User Guides
     pmdinfo
     dumpcap
     pdump
+    dmaperf
     flow-perf
     securityperf
     testbbdev
--
2.40.1


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-29 12:50     ` Jiang, Cheng1
@ 2023-06-29 13:19       ` Thomas Monjalon
  2023-06-29 13:24         ` Jiang, Cheng1
  0 siblings, 1 reply; 53+ messages in thread
From: Thomas Monjalon @ 2023-06-29 13:19 UTC (permalink / raw)
  To: Hu, Jiayu, Wang, YuanX, Jiang, Cheng1
  Cc: Richardson, Bruce, mb, Xia, Chenbo, amitprakashs, anoobj,
	huangdengdui, Laatz, Kevin, fengchengwen, jerinj, dev, Ding,
	Xuan, Ma, WenwuX, He, Xingguang, Ling, WeiX

29/06/2023 14:50, Jiang, Cheng1:
> From: Thomas Monjalon <thomas@monjalon.net>
> > 28/06/2023 03:20, Cheng Jiang:
> > > There are many high-performance DMA devices supported in DPDK now,
> > and
> > > these DMA devices can also be integrated into other modules of DPDK as
> > > accelerators, such as Vhost. Before integrating DMA into applications,
> > > developers need to know the performance of these DMA devices in
> > > various scenarios and the performance of CPUs in the same scenario,
> > > such as different buffer lengths. Only in this way can we know the
> > > target performance of the application accelerated by using them. This
> > > patch introduces a high-performance testing tool, which supports
> > > comparing the performance of CPU and DMA in different scenarios
> > > automatically with a pre-set config file. Memory Copy performance test are
> > supported for now.
> > >
> > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > 
> > Who is going to be the maintainer for this new app?
> > An entry in the file MAINTAINERS would be perfect.
> > 
> 
> I can be the maintainer for this new app. Okay, I will add an entry to the MAINTAINERS file.
> Would you prefer I send a new version patch to add this entry, or send a separate patch?

It should be part of this patch.






^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v10] app/dma-perf: introduce dma-perf application
  2023-06-29 13:19       ` Thomas Monjalon
@ 2023-06-29 13:24         ` Jiang, Cheng1
  0 siblings, 0 replies; 53+ messages in thread
From: Jiang, Cheng1 @ 2023-06-29 13:24 UTC (permalink / raw)
  To: Thomas Monjalon, Hu, Jiayu, Wang, YuanX
  Cc: Richardson, Bruce, mb, Xia, Chenbo, amitprakashs, anoobj,
	huangdengdui, Laatz, Kevin, fengchengwen, jerinj, dev, Ding,
	Xuan, Ma, WenwuX, He, Xingguang, Ling, WeiX



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Thursday, June 29, 2023 9:19 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; Wang, YuanX <yuanx.wang@intel.com>;
> Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>;
> amitprakashs@marvell.com; anoobj@marvell.com;
> huangdengdui@huawei.com; Laatz, Kevin <kevin.laatz@intel.com>;
> fengchengwen@huawei.com; jerinj@marvell.com; dev@dpdk.org; Ding,
> Xuan <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; He,
> Xingguang <xingguang.he@intel.com>; Ling, WeiX <weix.ling@intel.com>
> Subject: Re: [PATCH v10] app/dma-perf: introduce dma-perf application
> 
> 29/06/2023 14:50, Jiang, Cheng1:
> > From: Thomas Monjalon <thomas@monjalon.net>
> > > 28/06/2023 03:20, Cheng Jiang:
> > > > There are many high-performance DMA devices supported in DPDK
> now,
> > > and
> > > > these DMA devices can also be integrated into other modules of
> > > > DPDK as accelerators, such as Vhost. Before integrating DMA into
> > > > applications, developers need to know the performance of these DMA
> > > > devices in various scenarios and the performance of CPUs in the
> > > > same scenario, such as different buffer lengths. Only in this way
> > > > can we know the target performance of the application accelerated
> > > > by using them. This patch introduces a high-performance testing
> > > > tool, which supports comparing the performance of CPU and DMA in
> > > > different scenarios automatically with a pre-set config file.
> > > > Memory Copy performance test are
> > > supported for now.
> > > >
> > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > >
> > > Who is going to be the maintainer for this new app?
> > > An entry in the file MAINTAINERS would be perfect.
> > >
> >
> > I can be the maintainer for this new app. Okay, I will add an entry to the
> MAINTAINERS file.
> > Would you prefer I send a new version patch to add this entry, or send a
> separate patch?
> 
> It should be part of this patch.
> 
> 
Sure, I'll submit a new patch. Thanks.

> 
> 


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH v11] app/dma-perf: introduce dma-perf application
  2023-06-29 13:14 ` [PATCH v11] " Cheng Jiang
@ 2023-07-03  8:20   ` fengchengwen
  2023-07-07  9:56     ` Thomas Monjalon
  0 siblings, 1 reply; 53+ messages in thread
From: fengchengwen @ 2023-07-03  8:20 UTC (permalink / raw)
  To: Cheng Jiang, thomas, bruce.richardson, mb, chenbo.xia,
	amitprakashs, anoobj, huangdengdui, kevin.laatz, jerinj
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he, weix.ling

Acked-by: Chengwen Feng <fengchengwen@huawei.com>

On 2023/6/29 21:14, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> This patch also updates the documentation and maintainer list for the
> application.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> Tested-by: Anoob Joseph <anoobj@marvell.com>
> Tested-by: Wei Ling <weix.ling@intel.com>

...

> 
> .
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH v11] app/dma-perf: introduce dma-perf application
  2023-07-03  8:20   ` fengchengwen
@ 2023-07-07  9:56     ` Thomas Monjalon
  0 siblings, 0 replies; 53+ messages in thread
From: Thomas Monjalon @ 2023-07-07  9:56 UTC (permalink / raw)
  To: Cheng Jiang
  Cc: bruce.richardson, mb, chenbo.xia, amitprakashs, anoobj,
	huangdengdui, kevin.laatz, jerinj, dev, jiayu.hu, xuan.ding,
	wenwux.ma, yuanx.wang, xingguang.he, weix.ling, fengchengwen

> > There are many high-performance DMA devices supported in DPDK now, and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in various
> > scenarios and the performance of CPUs in the same scenario, such as
> > different buffer lengths. Only in this way can we know the target
> > performance of the application accelerated by using them. This patch
> > introduces a high-performance testing tool, which supports comparing the
> > performance of CPU and DMA in different scenarios automatically with a
> > pre-set config file. Memory Copy performance test are supported for now.
> > This patch also updates the documentation and maintainer list for the
> > application.
> > 
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > Acked-by: Anoob Joseph <anoobj@marvell.com>
> > Tested-by: Anoob Joseph <anoobj@marvell.com>
> > Tested-by: Wei Ling <weix.ling@intel.com>
> 
> Acked-by: Chengwen Feng <fengchengwen@huawei.com>

Applied with a few improvements in the documentation, thanks.



^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH v10] app/dma-perf: introduce dma-perf application
@ 2023-06-28 23:50 Zhang, Yuying
  0 siblings, 0 replies; 53+ messages in thread
From: Zhang, Yuying @ 2023-06-28 23:50 UTC (permalink / raw)
  To: dev, Jiang, Cheng1

[-- Attachment #1: Type: text/plain, Size: 68812 bytes --]

Hi Cheng,



LGTM.



> -----Original Message-----

> Date: Wed, 28 Jun 2023 01:20:34 +0000

> From: Cheng Jiang <cheng1.jiang@intel.com>

> To: thomas@monjalon.net, bruce.richardson@intel.com,

>            mb@smartsharesystems.com, chenbo.xia@intel.com,

>            amitprakashs@marvell.com, anoobj@marvell.com,

> huangdengdui@huawei.com,

>            kevin.laatz@intel.com, fengchengwen@huawei.com, jerinj@marvell.com

> Cc: dev@dpdk.org, jiayu.hu@intel.com, xuan.ding@intel.com,

>            wenwux.ma@intel.com, yuanx.wang@intel.com, xingguang.he@intel.com,

>            weix.ling@intel.com, Cheng Jiang <cheng1.jiang@intel.com>

> Subject: [PATCH v10] app/dma-perf: introduce dma-perf application

> Message-ID: <20230628012034.49016-1-cheng1.jiang@intel.com>

> Content-Type: text/plain; charset=UTF-8

>

> There are many high-performance DMA devices supported in DPDK now, and

> these DMA devices can also be integrated into other modules of DPDK as

> accelerators, such as Vhost. Before integrating DMA into applications,

> developers need to know the performance of these DMA devices in

> various scenarios and the performance of CPUs in the same scenario,

> such as different buffer lengths. Only in this way can we know the

> target performance of the application accelerated by using them. This

> patch introduces a high-performance testing tool, which supports

> comparing the performance of CPU and DMA in different scenarios

> automatically with a pre-set config file. Memory Copy performance test are supported for now.

>

> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>

> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>

> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>

> Acked-by: Morten Brørup <mb@smartsharesystems.com>

> Acked-by: Chenbo Xia <chenbo.xia@intel.com>



Acked-by: Yuying Zhang <yuying.zhang@intel.com>



> ---

> v10:

>   rebased code from 23.07-rc2;

> v9:

>   improved error handling;

>   improved lcore_params structure;

>   improved mbuf api calling;

>   improved exit process;

>   fixed some typos;

>   added scenario summary data display;

>   removed unnecessary include;

> v8:

>   fixed string copy issue in parse_lcore();

>   improved some data display format;

>   added doc in doc/guides/tools;

>   updated release notes;

> v7:

>   fixed some strcpy issues;

>   removed cache setup in calling rte_pktmbuf_pool_create();

>   fixed some typos;

>   added some memory free and null set operations;

>   improved result calculation;

> v6:

>   improved code based on Anoob's comments;

>   fixed some code structure issues;

> v5:

>   fixed some LONG_LINE warnings;

> v4:

>   fixed inaccuracy of the memory footprint display;

> v3:

>   fixed some typos;

> v2:

>   added lcore/dmadev designation;

>   added error case process;

>   removed worker_threads parameter from config.ini;

>   improved the logs;

>   improved config file;

>

>  app/meson.build                        |   1 +

>  app/test-dma-perf/benchmark.c          | 508 ++++++++++++++++++++

>  app/test-dma-perf/config.ini           |  61 +++

>  app/test-dma-perf/main.c               | 616 +++++++++++++++++++++++++

>  app/test-dma-perf/main.h               |  64 +++

>  app/test-dma-perf/meson.build          |  17 +

>  doc/guides/rel_notes/release_23_07.rst |   6 +

>  doc/guides/tools/dmaperf.rst           | 103 +++++

>  doc/guides/tools/index.rst             |   1 +

>  9 files changed, 1377 insertions(+)

>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644

> app/test-dma-perf/config.ini  create mode 100644

> app/test-dma-perf/main.c create mode 100644 app/test-dma-perf/main.h

> create mode 100644 app/test-dma-perf/meson.build  create mode 100644

> doc/guides/tools/dmaperf.rst

>

> diff --git a/app/meson.build b/app/meson.build index

> 74d2420f67..4fc1a83eba 100644

> --- a/app/meson.build

> +++ b/app/meson.build

> @@ -19,6 +19,7 @@ apps = [

>          'test-cmdline',

>          'test-compress-perf',

>          'test-crypto-perf',

> +        'test-dma-perf',

>          'test-eventdev',

>          'test-fib',

>          'test-flow-perf',

> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-

> perf/benchmark.c new file mode 100644 index 0000000000..0601e0d171

> --- /dev/null

> +++ b/app/test-dma-perf/benchmark.c

> @@ -0,0 +1,508 @@

> +/* SPDX-License-Identifier: BSD-3-Clause

> + * Copyright(c) 2023 Intel Corporation  */

> +

> +#include <inttypes.h>

> +#include <stdio.h>

> +#include <stdlib.h>

> +#include <unistd.h>

> +

> +#include <rte_time.h>

> +#include <rte_mbuf.h>

> +#include <rte_dmadev.h>

> +#include <rte_malloc.h>

> +#include <rte_lcore.h>

> +

> +#include "main.h"

> +

> +#define MAX_DMA_CPL_NB 255

> +

> +#define TEST_WAIT_U_SECOND 10000

> +#define POLL_MAX 1000

> +

> +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%"

> PRIu64 ",%.3lf,%.3lf\n"

> +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%"

> PRIu64 ",%.3lf,%.3lf\n"

> +

> +#define CSV_TOTAL_LINE_FMT "Scenario %u

> Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"

> +

> +struct worker_info {

> +         bool ready_flag;

> +         bool start_flag;

> +         bool stop_flag;

> +         uint32_t total_cpl;

> +         uint32_t test_cpl;

> +};

> +

> +struct lcore_params {

> +         uint8_t scenario_id;

> +         unsigned int lcore_id;

> +         char *dma_name;

> +         uint16_t worker_id;

> +         uint16_t dev_id;

> +         uint32_t nr_buf;

> +         uint16_t kick_batch;

> +         uint32_t buf_size;

> +         uint16_t test_secs;

> +         struct rte_mbuf **srcs;

> +         struct rte_mbuf **dsts;

> +         volatile struct worker_info worker_info; };

> +

> +static struct rte_mempool *src_pool;

> +static struct rte_mempool *dst_pool;

> +

> +static struct lcore_params *lcores[MAX_WORKER_NB];

> +

> +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)

> +

> +static inline int

> +__rte_format_printf(3, 4)

> +print_err(const char *func, int lineno, const char *format, ...) {

> +         va_list ap;

> +         int ret;

> +

> +         ret = fprintf(stderr, "In %s:%d - ", func, lineno);

> +         va_start(ap, format);

> +         ret += vfprintf(stderr, format, ap);

> +         va_end(ap);

> +

> +         return ret;

> +}

> +

> +static inline void

> +calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers,

> uint16_t test_secs,

> +                                                    uint32_t total_cnt, float *memory, uint32_t

> *ave_cycle,

> +                                                    float *bandwidth, float *mops)

> +{

> +         float ops;

> +

> +         *memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 *

> 1024);

> +         *ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;

> +         ops = (float)total_cnt / test_secs;

> +         *mops = ops / (1000 * 1000);

> +         *bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000); }

> +

> +static void

> +output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,

> uint16_t ring_size,

> +                                      uint16_t kick_batch, uint64_t ave_cycle, uint32_t

> buf_size, uint32_t nr_buf,

> +                                      float memory, float bandwidth, float mops, bool

> is_dma) {

> +         if (is_dma)

> +                       printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch

> Size: %u.\n",

> +                                                    lcore_id, dma_name, ring_size, kick_batch);

> +         else

> +                       printf("lcore %u\n", lcore_id);

> +

> +         printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer

> Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",

> +                                      ave_cycle, buf_size, nr_buf, memory,

> rte_get_timer_hz()/1000000000.0);

> +         printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth,

> +mops);

> +

> +         if (is_dma)

> +                       snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,

> CSV_LINE_DMA_FMT,

> +                                      scenario_id, lcore_id, dma_name, ring_size,

> kick_batch, buf_size,

> +                                      nr_buf, memory, ave_cycle, bandwidth, mops);

> +         else

> +                       snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,

> CSV_LINE_CPU_FMT,

> +                                      scenario_id, lcore_id, buf_size,

> +                                      nr_buf, memory, ave_cycle, bandwidth, mops); }

> +

> +static inline void

> +cache_flush_buf(__rte_unused struct rte_mbuf **array,

> +                       __rte_unused uint32_t buf_size,

> +                       __rte_unused uint32_t nr_buf)

> +{

> +#ifdef RTE_ARCH_X86_64

> +         char *data;

> +         struct rte_mbuf **srcs = array;

> +         uint32_t i, offset;

> +

> +         for (i = 0; i < nr_buf; i++) {

> +                       data = rte_pktmbuf_mtod(srcs[i], char *);

> +                       for (offset = 0; offset < buf_size; offset += 64)

> +                                      __builtin_ia32_clflush(data + offset);

> +         }

> +#endif

> +}

> +

> +/* Configuration of device. */

> +static void

> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {

> +         uint16_t vchan = 0;

> +         struct rte_dma_info info;

> +         struct rte_dma_conf dev_config = { .nb_vchans = 1 };

> +         struct rte_dma_vchan_conf qconf = {

> +                       .direction = RTE_DMA_DIR_MEM_TO_MEM,

> +                       .nb_desc = ring_size

> +         };

> +

> +         if (rte_dma_configure(dev_id, &dev_config) != 0)

> +                       rte_exit(EXIT_FAILURE, "Error with dma configure.\n");

> +

> +         if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)

> +                       rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");

> +

> +         if (rte_dma_info_get(dev_id, &info) != 0)

> +                       rte_exit(EXIT_FAILURE, "Error with getting device info.\n");

> +

> +         if (info.nb_vchans != 1)

> +                       rte_exit(EXIT_FAILURE, "Error, no configured queues

> reported on device id. %u\n",

> +                                                    dev_id);

> +

> +         if (rte_dma_start(dev_id) != 0)

> +                       rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }

> +

> +static int

> +config_dmadevs(struct test_configure *cfg) {

> +         uint32_t ring_size = cfg->ring_size.cur;

> +         struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;

> +         uint32_t nb_workers = ldm->cnt;

> +         uint32_t i;

> +         int dev_id;

> +         uint16_t nb_dmadevs = 0;

> +         char *dma_name;

> +

> +         for (i = 0; i < ldm->cnt; i++) {

> +                       dma_name = ldm->dma_names[i];

> +                       dev_id = rte_dma_get_dev_id_by_name(dma_name);

> +                       if (dev_id < 0) {

> +                                      fprintf(stderr, "Error: Fail to find DMA %s.\n",

> dma_name);

> +                                      goto end;

> +                       }

> +

> +                       ldm->dma_ids[i] = dev_id;

> +                       configure_dmadev_queue(dev_id, ring_size);

> +                       ++nb_dmadevs;

> +         }

> +

> +end:

> +         if (nb_dmadevs < nb_workers) {

> +                       printf("Not enough dmadevs (%u) for all workers (%u).\n",

> nb_dmadevs, nb_workers);

> +                       return -1;

> +         }

> +

> +         printf("Number of used dmadevs: %u.\n", nb_dmadevs);

> +

> +         return 0;

> +}

> +

> +static void

> +error_exit(int dev_id)

> +{

> +         rte_dma_stop(dev_id);

> +         rte_dma_close(dev_id);

> +         rte_exit(EXIT_FAILURE, "DMA error\n"); }

> +

> +static inline void

> +do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,

> +                                      volatile struct worker_info *worker_info) {

> +         int ret;

> +         uint16_t nr_cpl;

> +

> +         ret = rte_dma_submit(dev_id, 0);

> +         if (ret < 0)

> +                       error_exit(dev_id);

> +

> +         nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL,

> NULL);

> +         *async_cnt -= nr_cpl;

> +         worker_info->total_cpl += nr_cpl;

> +}

> +

> +static inline int

> +do_dma_mem_copy(void *p)

> +{

> +         struct lcore_params *para = (struct lcore_params *)p;

> +         volatile struct worker_info *worker_info = &(para->worker_info);

> +         const uint16_t dev_id = para->dev_id;

> +         const uint32_t nr_buf = para->nr_buf;

> +         const uint16_t kick_batch = para->kick_batch;

> +         const uint32_t buf_size = para->buf_size;

> +         struct rte_mbuf **srcs = para->srcs;

> +         struct rte_mbuf **dsts = para->dsts;

> +         uint16_t nr_cpl;

> +         uint64_t async_cnt = 0;

> +         uint32_t i;

> +         uint32_t poll_cnt = 0;

> +         int ret;

> +

> +         worker_info->stop_flag = false;

> +         worker_info->ready_flag = true;

> +

> +         while (!worker_info->start_flag)

> +                       ;

> +

> +         while (1) {

> +                       for (i = 0; i < nr_buf; i++) {

> +dma_copy:

> +                                      ret = rte_dma_copy(dev_id, 0,

> rte_mbuf_data_iova(srcs[i]),

> +                                                    rte_mbuf_data_iova(dsts[i]), buf_size, 0);

> +                                      if (unlikely(ret < 0)) {

> +                                                    if (ret == -ENOSPC) {

> +                                                                   do_dma_submit_and_poll(dev_id,

> &async_cnt, worker_info);

> +                                                                   goto dma_copy;

> +                                                    } else

> +                                                                   error_exit(dev_id);

> +                                      }

> +                                      async_cnt++;

> +

> +                                      if ((async_cnt % kick_batch) == 0)

> +                                                    do_dma_submit_and_poll(dev_id,

> &async_cnt, worker_info);

> +                       }

> +

> +                       if (worker_info->stop_flag)

> +                                      break;

> +         }

> +

> +         rte_dma_submit(dev_id, 0);

> +         while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {

> +                       nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,

> NULL, NULL);

> +                       async_cnt -= nr_cpl;

> +         }

> +

> +         return 0;

> +}

> +

> +static inline int

> +do_cpu_mem_copy(void *p)

> +{

> +         struct lcore_params *para = (struct lcore_params *)p;

> +         volatile struct worker_info *worker_info = &(para->worker_info);

> +         const uint32_t nr_buf = para->nr_buf;

> +         const uint32_t buf_size = para->buf_size;

> +         struct rte_mbuf **srcs = para->srcs;

> +         struct rte_mbuf **dsts = para->dsts;

> +         uint32_t i;

> +

> +         worker_info->stop_flag = false;

> +         worker_info->ready_flag = true;

> +

> +         while (!worker_info->start_flag)

> +                       ;

> +

> +         while (1) {

> +                       for (i = 0; i < nr_buf; i++) {

> +                                      /* copy buffer form src to dst */

> +                                      rte_memcpy((void

> *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),

> +                                                    (void

> *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),

> +                                                    (size_t)buf_size);

> +                                      worker_info->total_cpl++;

> +                       }

> +                       if (worker_info->stop_flag)

> +                                      break;

> +         }

> +

> +         return 0;

> +}

> +

> +static int

> +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,

> +                                      struct rte_mbuf ***dsts)

> +{

> +         unsigned int buf_size = cfg->buf_size.cur;

> +         unsigned int nr_sockets;

> +         uint32_t nr_buf = cfg->nr_buf;

> +

> +         nr_sockets = rte_socket_count();

> +         if (cfg->src_numa_node >= nr_sockets ||

> +                       cfg->dst_numa_node >= nr_sockets) {

> +                       printf("Error: Source or destination numa exceeds the acture

> numa nodes.\n");

> +                       return -1;

> +         }

> +

> +         src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",

> +                                      nr_buf,

> +                                      0,

> +                                      0,

> +                                      buf_size + RTE_PKTMBUF_HEADROOM,

> +                                      cfg->src_numa_node);

> +         if (src_pool == NULL) {

> +                       PRINT_ERR("Error with source mempool creation.\n");

> +                       return -1;

> +         }

> +

> +         dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",

> +                                      nr_buf,

> +                                      0,

> +                                      0,

> +                                      buf_size + RTE_PKTMBUF_HEADROOM,

> +                                      cfg->dst_numa_node);

> +         if (dst_pool == NULL) {

> +                       PRINT_ERR("Error with destination mempool creation.\n");

> +                       return -1;

> +         }

> +

> +         *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);

> +         if (*srcs == NULL) {

> +                       printf("Error: srcs malloc failed.\n");

> +                       return -1;

> +         }

> +

> +         *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);

> +         if (*dsts == NULL) {

> +                       printf("Error: dsts malloc failed.\n");

> +                       return -1;

> +         }

> +

> +         if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {

> +                       printf("alloc src mbufs failed.\n");

> +                       return -1;

> +         }

> +

> +         if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {

> +                       printf("alloc dst mbufs failed.\n");

> +                       return -1;

> +         }

> +

> +         return 0;

> +}

> +

> +void

> +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {

> +         uint16_t i;

> +         uint32_t offset;

> +         unsigned int lcore_id = 0;

> +         struct rte_mbuf **srcs = NULL, **dsts = NULL;

> +         struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;

> +         unsigned int buf_size = cfg->buf_size.cur;

> +         uint16_t kick_batch = cfg->kick_batch.cur;

> +         uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /

> (cfg->buf_size.cur * 2);

> +         uint16_t nb_workers = ldm->cnt;

> +         uint16_t test_secs = cfg->test_secs;

> +         float memory = 0;

> +         uint32_t avg_cycles = 0;

> +         uint32_t avg_cycles_total;

> +         float mops, mops_total;

> +         float bandwidth, bandwidth_total;

> +

> +         if (setup_memory_env(cfg, &srcs, &dsts) < 0)

> +                       goto out;

> +

> +         if (is_dma)

> +                       if (config_dmadevs(cfg) < 0)

> +                                      goto out;

> +

> +         if (cfg->cache_flush == 1) {

> +                       cache_flush_buf(srcs, buf_size, nr_buf);

> +                       cache_flush_buf(dsts, buf_size, nr_buf);

> +                       rte_mb();

> +         }

> +

> +         printf("Start testing....\n");

> +

> +         for (i = 0; i < nb_workers; i++) {

> +                       lcore_id = ldm->lcores[i];

> +                       offset = nr_buf / nb_workers * i;

> +                       lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);

> +                       if (lcores[i] == NULL) {

> +                                      printf("lcore parameters malloc failure for

> lcore %d\n", lcore_id);

> +                                      break;

> +                       }

> +                       if (is_dma) {

> +                                     lcores[i]->dma_name = ldm->dma_names[i];

> +                                      lcores[i]->dev_id = ldm->dma_ids[i];

> +                                      lcores[i]->kick_batch = kick_batch;

> +                       }

> +                       lcores[i]->worker_id = i;

> +                       lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);

> +                       lcores[i]->buf_size = buf_size;

> +                       lcores[i]->test_secs = test_secs;

> +                       lcores[i]->srcs = srcs + offset;

> +                       lcores[i]->dsts = dsts + offset;

> +                       lcores[i]->scenario_id = cfg->scenario_id;

> +                       lcores[i]->lcore_id = lcore_id;

> +

> +                       if (is_dma)

> +                                      rte_eal_remote_launch(do_dma_mem_copy, (void

> *)(lcores[i]), lcore_id);

> +                       else

> +                                      rte_eal_remote_launch(do_cpu_mem_copy, (void

> *)(lcores[i]), lcore_id);

> +         }

> +

> +         while (1) {

> +                       bool ready = true;

> +                       for (i = 0; i < nb_workers; i++) {

> +                                      if (lcores[i]->worker_info.ready_flag == false) {

> +                                                    ready = 0;

> +                                                    break;

> +                                      }

> +                       }

> +                       if (ready)

> +                                      break;

> +         }

> +

> +         for (i = 0; i < nb_workers; i++)

> +                       lcores[i]->worker_info.start_flag = true;

> +

> +         usleep(TEST_WAIT_U_SECOND);

> +         for (i = 0; i < nb_workers; i++)

> +                       lcores[i]->worker_info.test_cpl = lcores[i]-

> >worker_info.total_cpl;

> +

> +         usleep(test_secs * 1000 * 1000);

> +         for (i = 0; i < nb_workers; i++)

> +                       lcores[i]->worker_info.test_cpl = lcores[i]-

> >worker_info.total_cpl -

> +                                                                                 lcores[i]-

> >worker_info.test_cpl;

> +

> +         for (i = 0; i < nb_workers; i++)

> +                       lcores[i]->worker_info.stop_flag = true;

> +

> +         rte_eal_mp_wait_lcore();

> +

> +         mops_total = 0;

> +         bandwidth_total = 0;

> +         avg_cycles_total = 0;

> +         for (i = 0; i < nb_workers; i++) {

> +                       calc_result(buf_size, nr_buf, nb_workers, test_secs,

> +                                      lcores[i]->worker_info.test_cpl,

> +                                      &memory, &avg_cycles, &bandwidth, &mops);

> +                       output_result(cfg->scenario_id, lcores[i]->lcore_id,

> +                                                                   lcores[i]->dma_name, cfg-

> >ring_size.cur, kick_batch,

> +                                                                   avg_cycles, buf_size, nr_buf /

> nb_workers, memory,

> +                                                                   bandwidth, mops, is_dma);

> +                       mops_total += mops;

> +                       bandwidth_total += bandwidth;

> +                       avg_cycles_total += avg_cycles;

> +         }

> +         printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n",

> bandwidth_total, mops_total);

> +         snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN,

> CSV_TOTAL_LINE_FMT,

> +                                      cfg->scenario_id, nr_buf, memory * nb_workers,

> +                                      avg_cycles_total / nb_workers, bandwidth_total,

> mops_total);

> +

> +out:

> +         /* free mbufs used in the test */

> +         if (srcs != NULL)

> +                       rte_pktmbuf_free_bulk(srcs, nr_buf);

> +         if (dsts != NULL)

> +                       rte_pktmbuf_free_bulk(dsts, nr_buf);

> +

> +         /* free the points for the mbufs */

> +         rte_free(srcs);

> +         srcs = NULL;

> +         rte_free(dsts);

> +         dsts = NULL;

> +

> +         rte_mempool_free(src_pool);

> +         src_pool = NULL;

> +

> +         rte_mempool_free(dst_pool);

> +         dst_pool = NULL;

> +

> +         /* free the worker parameters */

> +         for (i = 0; i < nb_workers; i++) {

> +                       rte_free(lcores[i]);

> +                       lcores[i] = NULL;

> +         }

> +

> +         if (is_dma) {

> +                       for (i = 0; i < nb_workers; i++) {

> +                                      printf("Stopping dmadev %d\n", ldm->dma_ids[i]);

> +                                      rte_dma_stop(ldm->dma_ids[i]);

> +                       }

> +         }

> +}

> diff --git a/app/test-dma-perf/config.ini

> b/app/test-dma-perf/config.ini new file mode 100644 index

> 0000000000..b550f4b23f

> --- /dev/null

> +++ b/app/test-dma-perf/config.ini

> @@ -0,0 +1,61 @@

> +

> +; This is an example configuration file for dma-perf, which details

> +the meanings of each parameter ; and instructions on how to use dma-perf.

> +

> +; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.

> +

> +; Parameters:

> +; "mem_size" denotes the size of the memory footprint.

> +; "buf_size" denotes the memory size of a single operation.

> +; "dma_ring_size" denotes the dma ring buffer size. It should be must

> +be a power of two, and between ;  64 and 4096.

> +; "kick_batch" denotes the dma operation batch size, and should be

> +greater

> than 1 normally.

> +

> +; The format for variables is variable=first,last,increment,ADD|MUL.

> +

> +; src_numa_node is used to control the numa node where the source

> memory is allocated.

> +; dst_numa_node is used to control the numa node where the

> +destination

> memory is allocated.

> +

> +; cache_flush is used to determine whether or not the cache should be

> +flushed, with 1 indicating to ; flush and 0 indicating to not flush.

> +

> +; test_seconds controls the test time of the whole case.

> +

> +; To use DMA for a test, please specify the "lcore_dma" parameter.

> +; If you have already set the "-l" and "-a" parameters using EAL, ;

> +make sure that the value of "lcore_dma" falls within their range of

> +the

> values.

> +; We have to ensure a 1:1 mapping between the core and DMA device.

> +

> +; To use CPU for a test, please specify the "lcore" parameter.

> +; If you have already set the "-l" and "-a" parameters using EAL, ;

> +make sure that the value of "lcore" falls within their range of values.

> +

> +; To specify a configuration file, use the "--config" flag followed

> +by the path

> to the file.

> +

> +; To specify a result file, use the "--result" flag followed by the

> +path to the

> file.

> +; If you do not specify a result file, one will be generated with the

> +same name as the configuration ; file, with the addition of

> +"_result.csv" at

> the end.

> +

> +[case1]

> +type=DMA_MEM_COPY

> +mem_size=10

> +buf_size=64,8192,2,MUL

> +dma_ring_size=1024

> +kick_batch=32

> +src_numa_node=0

> +dst_numa_node=0

> +cache_flush=0

> +test_seconds=2

> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3

> +eal_args=--in-memory --file-prefix=test

> +

> +[case2]

> +type=CPU_MEM_COPY

> +mem_size=10

> +buf_size=64,8192,2,MUL

> +src_numa_node=0

> +dst_numa_node=1

> +cache_flush=0

> +test_seconds=2

> +lcore = 3, 4

> +eal_args=--in-memory --no-pci

> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c new

> file mode 100644 index 0000000000..de37120df6

> --- /dev/null

> +++ b/app/test-dma-perf/main.c

> @@ -0,0 +1,616 @@

> +/* SPDX-License-Identifier: BSD-3-Clause

> + * Copyright(c) 2023 Intel Corporation  */

> +

> +#include <stdio.h>

> +#include <stdlib.h>

> +#include <getopt.h>

> +#include <signal.h>

> +#include <stdbool.h>

> +#include <unistd.h>

> +#include <sys/wait.h>

> +#include <inttypes.h>

> +#include <libgen.h>

> +

> +#include <rte_eal.h>

> +#include <rte_cfgfile.h>

> +#include <rte_string_fns.h>

> +#include <rte_lcore.h>

> +

> +#include "main.h"

> +

> +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,DMA ring size,kick batch

> size,buffer size(B),number of buffers,memory(MB),average

> cycle,bandwidth(Gbps),MOps\n"

> +

> +#define MAX_EAL_PARAM_NB 100

> +#define MAX_EAL_PARAM_LEN 1024

> +

> +#define DMA_MEM_COPY "DMA_MEM_COPY"

> +#define CPU_MEM_COPY "CPU_MEM_COPY"

> +

> +#define CMDLINE_CONFIG_ARG "--config"

> +#define CMDLINE_RESULT_ARG "--result"

> +

> +#define MAX_PARAMS_PER_ENTRY 4

> +

> +#define MAX_LONG_OPT_SZ 64

> +

> +enum {

> +         TEST_TYPE_NONE = 0,

> +         TEST_TYPE_DMA_MEM_COPY,

> +         TEST_TYPE_CPU_MEM_COPY

> +};

> +

> +#define MAX_TEST_CASES 16

> +static struct test_configure test_cases[MAX_TEST_CASES];

> +

> +char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];

> +

> +static FILE *fd;

> +

> +static void

> +output_csv(bool need_blankline)

> +{

> +         uint32_t i;

> +

> +         if (need_blankline) {

> +                       fprintf(fd, ",,,,,,,,\n");

> +                       fprintf(fd, ",,,,,,,,\n");

> +         }

> +

> +         for (i = 0; i < RTE_DIM(output_str); i++) {

> +                       if (output_str[i][0]) {

> +                                      fprintf(fd, "%s", output_str[i]);

> +                                      output_str[i][0] = '\0';

> +                       }

> +         }

> +

> +         fflush(fd);

> +}

> +

> +static void

> +output_env_info(void)

> +{

> +         snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "Test

> Environment:\n");

> +         snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU

> frequency,%.3lf Ghz",

> +                                      rte_get_timer_hz() / 1000000000.0);

> +

> +         output_csv(true);

> +}

> +

> +static void

> +output_header(uint32_t case_id, struct test_configure *case_cfg) {

> +         snprintf(output_str[0], MAX_OUTPUT_STR_LEN,

> +                                      CSV_HDR_FMT, case_id, case_cfg->test_type_str);

> +

> +         output_csv(true);

> +}

> +

> +static void

> +run_test_case(struct test_configure *case_cfg) {

> +         switch (case_cfg->test_type) {

> +         case TEST_TYPE_DMA_MEM_COPY:

> +                       mem_copy_benchmark(case_cfg, true);

> +                       break;

> +         case TEST_TYPE_CPU_MEM_COPY:

> +                       mem_copy_benchmark(case_cfg, false);

> +                       break;

> +         default:

> +                       printf("Unknown test type. %s\n", case_cfg->test_type_str);

> +                       break;

> +         }

> +}

> +

> +static void

> +run_test(uint32_t case_id, struct test_configure *case_cfg) {

> +         uint32_t i;

> +         uint32_t nb_lcores = rte_lcore_count();

> +         struct test_configure_entry *mem_size = &case_cfg->mem_size;

> +         struct test_configure_entry *buf_size = &case_cfg->buf_size;

> +         struct test_configure_entry *ring_size = &case_cfg->ring_size;

> +         struct test_configure_entry *kick_batch = &case_cfg->kick_batch;

> +         struct test_configure_entry dummy = { 0 };

> +         struct test_configure_entry *var_entry = &dummy;

> +

> +         for (i = 0; i < RTE_DIM(output_str); i++)

> +                       memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);

> +

> +         if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {

> +                       printf("Case %u: Not enough lcores.\n", case_id);

> +                       return;

> +         }

> +

> +         printf("Number of used lcores: %u.\n", nb_lcores);

> +

> +         if (mem_size->incr != 0)

> +                       var_entry = mem_size;

> +

> +         if (buf_size->incr != 0)

> +                       var_entry = buf_size;

> +

> +         if (ring_size->incr != 0)

> +                       var_entry = ring_size;

> +

> +         if (kick_batch->incr != 0)

> +                       var_entry = kick_batch;

> +

> +         case_cfg->scenario_id = 0;

> +

> +         output_header(case_id, case_cfg);

> +

> +         for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry-

> >last;) {

> +                       case_cfg->scenario_id++;

> +                       printf("\nRunning scenario %d\n", case_cfg->scenario_id);

> +

> +                       run_test_case(case_cfg);

> +                       output_csv(false);

> +

> +                       if (var_entry->op == OP_ADD)

> +                                      var_entry->cur += var_entry->incr;

> +                       else if (var_entry->op == OP_MUL)

> +                                      var_entry->cur *= var_entry->incr;

> +                       else {

> +                                      printf("No proper operation for variable entry.\n");

> +                                      break;

> +                       }

> +         }

> +}

> +

> +static int

> +parse_lcore(struct test_configure *test_case, const char *value) {

> +         uint16_t len;

> +         char *input;

> +         struct lcore_dma_map_t *lcore_dma_map;

> +

> +         if (test_case == NULL || value == NULL)

> +                       return -1;

> +

> +         len = strlen(value);

> +         input = (char *)malloc((len + 1) * sizeof(char));

> +         strlcpy(input, value, len + 1);

> +         lcore_dma_map = &(test_case->lcore_dma_map);

> +

> +         memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));

> +

> +         char *token = strtok(input, ", ");

> +         while (token != NULL) {

> +                       if (lcore_dma_map->cnt >= MAX_LCORE_NB) {

> +                                      free(input);

> +                                      return -1;

> +                       }

> +

> +                       uint16_t lcore_id = atoi(token);

> +                       lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;

> +

> +                       token = strtok(NULL, ", ");

> +         }

> +

> +         free(input);

> +         return 0;

> +}

> +

> +static int

> +parse_lcore_dma(struct test_configure *test_case, const char *value) {

> +         struct lcore_dma_map_t *lcore_dma_map;

> +         char *input, *addrs;

> +         char *ptrs[2];

> +         char *start, *end, *substr;

> +         uint16_t lcore_id;

> +         int ret = 0;

> +

> +         if (test_case == NULL || value == NULL)

> +                       return -1;

> +

> +         input = strndup(value, strlen(value) + 1);

> +         addrs = input;

> +

> +         while (*addrs == '\0')

> +                       addrs++;

> +         if (*addrs == '\0') {

> +                       fprintf(stderr, "No input DMA addresses\n");

> +                       ret = -1;

> +                       goto out;

> +         }

> +

> +         substr = strtok(addrs, ",");

> +         if (substr == NULL) {

> +                       fprintf(stderr, "No input DMA address\n");

> +                       ret = -1;

> +                       goto out;

> +         }

> +

> +         memset(&test_case->lcore_dma_map, 0, sizeof(struct

> lcore_dma_map_t));

> +

> +         do {

> +                       if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') < 0) {

> +                                      fprintf(stderr, "Illegal DMA address\n");

> +                                      ret = -1;

> +                                      break;

> +                       }

> +

> +                       start = strstr(ptrs[0], "lcore");

> +                       if (start == NULL) {

> +                                      fprintf(stderr, "Illegal lcore\n");

> +                                      ret = -1;

> +                                      break;

> +                       }

> +

> +                       start += 5;

> +                       lcore_id = strtol(start, &end, 0);

> +                       if (end == start) {

> +                                      fprintf(stderr, "No input lcore ID or ID %d is wrong\n",

> lcore_id);

> +                                      ret = -1;

> +                                      break;

> +                       }

> +

> +                       lcore_dma_map = &test_case->lcore_dma_map;

> +                       if (lcore_dma_map->cnt >= MAX_LCORE_NB) {

> +                                      fprintf(stderr, "lcores count error\n");

> +                                      ret = -1;

> +                                      break;

> +                       }

> +

> +                       lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;

> +                       strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt],

> ptrs[1],

> +                                                    RTE_DEV_NAME_MAX_LEN);

> +                       lcore_dma_map->cnt++;

> +                       substr = strtok(NULL, ",");

> +         } while (substr != NULL);

> +

> +out:

> +         free(input);

> +         return ret;

> +}

> +

> +static int

> +parse_entry(const char *value, struct test_configure_entry *entry) {

> +         char input[255] = {0};

> +         char *args[MAX_PARAMS_PER_ENTRY];

> +         int args_nr = -1;

> +         int ret;

> +

> +         if (value == NULL || entry == NULL)

> +                       goto out;

> +

> +         strncpy(input, value, 254);

> +         if (*input == '\0')

> +                       goto out;

> +

> +         ret = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY,

> ',');

> +         if (ret != 1 && ret != 4)

> +                       goto out;

> +

> +         entry->cur = entry->first = (uint32_t)atoi(args[0]);

> +

> +         if (ret == 4) {

> +                       args_nr = 4;

> +                       entry->last = (uint32_t)atoi(args[1]);

> +                       entry->incr = (uint32_t)atoi(args[2]);

> +                       if (!strcmp(args[3], "MUL"))

> +                                      entry->op = OP_MUL;

> +                       else if (!strcmp(args[3], "ADD"))

> +                                      entry->op = OP_ADD;

> +                       else {

> +                                      args_nr = -1;

> +                                      printf("Invalid op %s.\n", args[3]);

> +                       }

> +

> +         } else {

> +                       args_nr = 1;

> +                       entry->op = OP_NONE;

> +                       entry->last = 0;

> +                       entry->incr = 0;

> +         }

> +out:

> +         return args_nr;

> +}

> +

> +static uint16_t

> +load_configs(const char *path)

> +{

> +         struct rte_cfgfile *cfgfile;

> +         int nb_sections, i;

> +         struct test_configure *test_case;

> +         char section_name[CFG_NAME_LEN];

> +         const char *case_type;

> +         const char *lcore_dma;

> +         const char *mem_size_str, *buf_size_str, *ring_size_str,

> *kick_batch_str;

> +         int args_nr, nb_vp;

> +         bool is_dma;

> +

> +         printf("config file parsing...\n");

> +         cfgfile = rte_cfgfile_load(path, 0);

> +         if (!cfgfile) {

> +                       printf("Open configure file error.\n");

> +                       exit(1);

> +         }

> +

> +         nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);

> +         if (nb_sections > MAX_TEST_CASES) {

> +                       printf("Error: The maximum number of cases is %d.\n",

> MAX_TEST_CASES);

> +                       exit(1);

> +         }

> +

> +         for (i = 0; i < nb_sections; i++) {

> +                       snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);

> +                       test_case = &test_cases[i];

> +                       case_type = rte_cfgfile_get_entry(cfgfile, section_name,

> "type");

> +                       if (case_type == NULL) {

> +                                      printf("Error: No case type in case %d, the test will be

> finished here.\n",

> +                                                    i + 1);

> +                                      test_case->is_valid = false;

> +                                      continue;

> +                       }

> +

> +                       if (strcmp(case_type, DMA_MEM_COPY) == 0) {

> +                                      test_case->test_type = TEST_TYPE_DMA_MEM_COPY;

> +                                      test_case->test_type_str = DMA_MEM_COPY;

> +                                      is_dma = true;

> +                       } else if (strcmp(case_type, CPU_MEM_COPY) == 0) {

> +                                      test_case->test_type = TEST_TYPE_CPU_MEM_COPY;

> +                                      test_case->test_type_str = CPU_MEM_COPY;

> +                                      is_dma = false;

> +                       } else {

> +                                      printf("Error: Wrong test case type %s in case%d.\n",

> case_type, i + 1);

> +                                      test_case->is_valid = false;

> +                                      continue;

> +                       }

> +

> +                       test_case->src_numa_node =

> (int)atoi(rte_cfgfile_get_entry(cfgfile,

> +                                                                                                             section_name,

> "src_numa_node"));

> +                       test_case->dst_numa_node =

> (int)atoi(rte_cfgfile_get_entry(cfgfile,

> +                                                                                                             section_name,

> "dst_numa_node"));

> +                       nb_vp = 0;

> +                       mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name,

> "mem_size");

> +                       args_nr = parse_entry(mem_size_str, &test_case-

> >mem_size);

> +                       if (args_nr < 0) {

> +                                      printf("parse error in case %d.\n", i + 1);

> +                                      test_case->is_valid = false;

> +                                      continue;

> +                       } else if (args_nr == 4)

> +                                      nb_vp++;

> +

> +                       buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name,

> "buf_size");

> +                       args_nr = parse_entry(buf_size_str, &test_case->buf_size);

> +                       if (args_nr < 0) {

> +                                      printf("parse error in case %d.\n", i + 1);

> +                                      test_case->is_valid = false;

> +                                      continue;

> +                       } else if (args_nr == 4)

> +                                      nb_vp++;

> +

> +                       if (is_dma) {

> +                                      ring_size_str = rte_cfgfile_get_entry(cfgfile,

> section_name,

> +

>            "dma_ring_size");

> +                                      args_nr = parse_entry(ring_size_str, &test_case-

> >ring_size);

> +                                      if (args_nr < 0) {

> +                                                    printf("parse error in case %d.\n", i + 1);

> +                                                    test_case->is_valid = false;

> +                                                    continue;

> +                                      } else if (args_nr == 4)

> +                                                    nb_vp++;

> +

> +                                      kick_batch_str = rte_cfgfile_get_entry(cfgfile,

> section_name, "kick_batch");

> +                                      args_nr = parse_entry(kick_batch_str, &test_case-

> >kick_batch);

> +                                      if (args_nr < 0) {

> +                                                    printf("parse error in case %d.\n", i + 1);

> +                                                    test_case->is_valid = false;

> +                                                    continue;

> +                                      } else if (args_nr == 4)

> +                                                    nb_vp++;

> +

> +                                      lcore_dma = rte_cfgfile_get_entry(cfgfile,

> section_name, "lcore_dma");

> +                                      int lcore_ret = parse_lcore_dma(test_case,

> lcore_dma);

> +                                      if (lcore_ret < 0) {

> +                                                    printf("parse lcore dma error in case %d.\n",

> i + 1);

> +                                                    test_case->is_valid = false;

> +                                                    continue;

> +                                      }

> +                       } else {

> +                                      lcore_dma = rte_cfgfile_get_entry(cfgfile,

> section_name, "lcore");

> +                                      int lcore_ret = parse_lcore(test_case, lcore_dma);

> +                                      if (lcore_ret < 0) {

> +                                                    printf("parse lcore error in case %d.\n", i + 1);

> +                                                    test_case->is_valid = false;

> +                                                    continue;

> +                                      }

> +                       }

> +

> +                       if (nb_vp > 1) {

> +                                      printf("Case %d error, each section can only have a

> single variable parameter.\n",

> +                                                                   i + 1);

> +                                      test_case->is_valid = false;

> +                                      continue;

> +                       }

> +

> +                       test_case->cache_flush =

> +                                      (uint8_t)atoi(rte_cfgfile_get_entry(cfgfile,

> section_name, "cache_flush"));

> +                       test_case->test_secs =

> (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,

> +                                                                   section_name, "test_seconds"));

> +

> +                       test_case->eal_args = rte_cfgfile_get_entry(cfgfile,

> section_name, "eal_args");

> +                       test_case->is_valid = true;

> +         }

> +

> +         rte_cfgfile_close(cfgfile);

> +         printf("config file parsing complete.\n\n");

> +         return i;

> +}

> +

> +/* Parse the argument given in the command line of the application */

> +static int append_eal_args(int argc, char **argv, const char

> +*eal_args, char **new_argv) {

> +         int i;

> +         char *tokens[MAX_EAL_PARAM_NB];

> +         char args[MAX_EAL_PARAM_LEN] = {0};

> +         int token_nb, new_argc = 0;

> +

> +         for (i = 0; i < argc; i++) {

> +                       if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||

> +                                                    (strcmp(argv[i], CMDLINE_RESULT_ARG) == 0))

> {

> +                                      i++;

> +                                      continue;

> +                       }

> +                       strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);

> +                       new_argc++;

> +         }

> +

> +         if (eal_args) {

> +                       strlcpy(args, eal_args, MAX_EAL_PARAM_LEN);

> +                       token_nb = rte_strsplit(args, strlen(args),

> +                                                                   tokens, MAX_EAL_PARAM_NB, ' ');

> +                       for (i = 0; i < token_nb; i++)

> +                                      strlcpy(new_argv[new_argc++], tokens[i],

> MAX_EAL_PARAM_LEN);

> +         }

> +

> +         return new_argc;

> +}

> +

> +int

> +main(int argc, char *argv[])

> +{

> +         int ret;

> +         uint16_t case_nb;

> +         uint32_t i, nb_lcores;

> +         pid_t cpid, wpid;

> +         int wstatus;

> +         char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];

> +         char *pargs[MAX_EAL_PARAM_NB];

> +         char *cfg_path_ptr = NULL;

> +         char *rst_path_ptr = NULL;

> +         char rst_path[PATH_MAX];

> +         int new_argc;

> +

> +         memset(args, 0, sizeof(args));

> +

> +         for (i = 0; i < RTE_DIM(pargs); i++)

> +                       pargs[i] = args[i];

> +

> +         for (i = 0; i < (uint32_t)argc; i++) {

> +                       if (strncmp(argv[i], CMDLINE_CONFIG_ARG,

> MAX_LONG_OPT_SZ) == 0)

> +                                      cfg_path_ptr = argv[i + 1];

> +                       if (strncmp(argv[i], CMDLINE_RESULT_ARG,

> MAX_LONG_OPT_SZ) == 0)

> +                                      rst_path_ptr = argv[i + 1];

> +         }

> +         if (cfg_path_ptr == NULL) {

> +                       printf("Config file not assigned.\n");

> +                       return -1;

> +         }

> +         if (rst_path_ptr == NULL) {

> +                       strlcpy(rst_path, cfg_path_ptr, PATH_MAX);

> +                       char *token = strtok(basename(rst_path), ".");

> +                       if (token == NULL) {

> +                                      printf("Config file error.\n");

> +                                      return -1;

> +                       }

> +                       strcat(token, "_result.csv");

> +                       rst_path_ptr = rst_path;

> +         }

> +

> +         case_nb = load_configs(cfg_path_ptr);

> +         fd = fopen(rst_path_ptr, "w");

> +         if (fd == NULL) {

> +                       printf("Open output CSV file error.\n");

> +                       return -1;

> +         }

> +         fclose(fd);

> +

> +         printf("Running cases...\n");

> +         for (i = 0; i < case_nb; i++) {

> +                       if (!test_cases[i].is_valid) {

> +                                      printf("Invalid test case %d.\n\n", i + 1);

> +                                      snprintf(output_str[0], MAX_OUTPUT_STR_LEN,

> "Invalid case %d\n", i +

> +1);

> +

> +                                      fd = fopen(rst_path_ptr, "a");

> +                                      if (!fd) {

> +                                                    printf("Open output CSV file error.\n");

> +                                                    return 0;

> +                                      }

> +                                      output_csv(true);

> +                                      fclose(fd);

> +                                      continue;

> +                       }

> +

> +                       if (test_cases[i].test_type == TEST_TYPE_NONE) {

> +                                      printf("No valid test type in test case %d.\n\n", i + 1);

> +                                      snprintf(output_str[0], MAX_OUTPUT_STR_LEN,

> "Invalid case %d\n", i +

> +1);

> +

> +                                      fd = fopen(rst_path_ptr, "a");

> +                                      if (!fd) {

> +                                                    printf("Open output CSV file error.\n");

> +                                                    return 0;

> +                                      }

> +                                      output_csv(true);

> +                                      fclose(fd);

> +                                      continue;

> +                       }

> +

> +                       cpid = fork();

> +                       if (cpid < 0) {

> +                                      printf("Fork case %d failed.\n", i + 1);

> +                                      exit(EXIT_FAILURE);

> +                       } else if (cpid == 0) {

> +                                      printf("\nRunning case %u\n\n", i + 1);

> +

> +                                      new_argc = append_eal_args(argc, argv,

> test_cases[i].eal_args, pargs);

> +                                      ret = rte_eal_init(new_argc, pargs);

> +                                      if (ret < 0)

> +                                                    rte_exit(EXIT_FAILURE, "Invalid EAL

> arguments\n");

> +

> +                                      /* Check lcores. */

> +                                      nb_lcores = rte_lcore_count();

> +                                      if (nb_lcores < 2)

> +                                                    rte_exit(EXIT_FAILURE,

> +                                                                   "There should be at least 2 worker

> lcores.\n");

> +

> +                                      fd = fopen(rst_path_ptr, "a");

> +                                      if (!fd) {

> +                                                    printf("Open output CSV file error.\n");

> +                                                    return 0;

> +                                      }

> +

> +                                      output_env_info();

> +

> +                                      run_test(i + 1, &test_cases[i]);

> +

> +                                      /* clean up the EAL */

> +                                     rte_eal_cleanup();

> +

> +                                      fclose(fd);

> +

> +                                      printf("\nCase %u completed.\n\n", i + 1);

> +

> +                                      exit(EXIT_SUCCESS);

> +                       } else {

> +                                      wpid = waitpid(cpid, &wstatus, 0);

> +                                      if (wpid == -1) {

> +                                                    printf("waitpid error.\n");

> +                                                    exit(EXIT_FAILURE);

> +                                      }

> +

> +                                      if (WIFEXITED(wstatus))

> +                                                    printf("Case process exited. status %d\n\n",

> +                                                                   WEXITSTATUS(wstatus));

> +                                      else if (WIFSIGNALED(wstatus))

> +                                                    printf("Case process killed by signal %d\n\n",

> +                                                                   WTERMSIG(wstatus));

> +                                      else if (WIFSTOPPED(wstatus))

> +                                                    printf("Case process stopped by

> signal %d\n\n",

> +                                                                   WSTOPSIG(wstatus));

> +                                      else if (WIFCONTINUED(wstatus))

> +                                                    printf("Case process continued.\n\n");

> +                                      else

> +                                                    printf("Case process unknown

> terminated.\n\n");

> +                       }

> +         }

> +

> +         printf("Bye...\n");

> +         return 0;

> +}

> +

> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new

> file mode 100644 index 0000000000..12bc3f4e3f

> --- /dev/null

> +++ b/app/test-dma-perf/main.h

> @@ -0,0 +1,64 @@

> +/* SPDX-License-Identifier: BSD-3-Clause

> + * Copyright(c) 2023 Intel Corporation  */

> +

> +#ifndef _MAIN_H_

> +#define _MAIN_H_

> +

> +

> +#include <rte_common.h>

> +#include <rte_cycles.h>

> +#include <rte_dev.h>

> +

> +#define MAX_WORKER_NB 128

> +#define MAX_OUTPUT_STR_LEN 512

> +

> +#define MAX_DMA_NB 128

> +#define MAX_LCORE_NB 256

> +

> +extern char output_str[MAX_WORKER_NB + 1][MAX_OUTPUT_STR_LEN];

> +

> +typedef enum {

> +         OP_NONE = 0,

> +         OP_ADD,

> +         OP_MUL

> +} alg_op_type;

> +

> +struct test_configure_entry {

> +         uint32_t first;

> +         uint32_t last;

> +         uint32_t incr;

> +         alg_op_type op;

> +         uint32_t cur;

> +};

> +

> +struct lcore_dma_map_t {

> +         uint32_t lcores[MAX_WORKER_NB];

> +         char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];

> +         int16_t dma_ids[MAX_WORKER_NB];

> +         uint16_t cnt;

> +};

> +

> +struct test_configure {

> +         bool is_valid;

> +         uint8_t test_type;

> +         const char *test_type_str;

> +         uint16_t src_numa_node;

> +         uint16_t dst_numa_node;

> +         uint16_t opcode;

> +         bool is_dma;

> +         struct lcore_dma_map_t lcore_dma_map;

> +         struct test_configure_entry mem_size;

> +         struct test_configure_entry buf_size;

> +         struct test_configure_entry ring_size;

> +         struct test_configure_entry kick_batch;

> +         uint8_t cache_flush;

> +         uint32_t nr_buf;

> +         uint16_t test_secs;

> +         const char *eal_args;

> +         uint8_t scenario_id;

> +};

> +

> +void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);

> +

> +#endif /* _MAIN_H_ */

> diff --git a/app/test-dma-perf/meson.build b/app/test-dma-

> perf/meson.build new file mode 100644 index 0000000000..bd6c264002

> --- /dev/null

> +++ b/app/test-dma-perf/meson.build

> @@ -0,0 +1,17 @@

> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023

> +Intel Corporation

> +

> +# meson file, for building this app as part of a main DPDK build.

> +

> +if is_windows

> +    build = false

> +    reason = 'not supported on Windows'

> +    subdir_done()

> +endif

> +

> +deps += ['dmadev', 'mbuf', 'cfgfile']

> +

> +sources = files(

> +        'main.c',

> +        'benchmark.c',

> +)

> diff --git a/doc/guides/rel_notes/release_23_07.rst

> b/doc/guides/rel_notes/release_23_07.rst

> index 4459144140..796cc5517d 100644

> --- a/doc/guides/rel_notes/release_23_07.rst

> +++ b/doc/guides/rel_notes/release_23_07.rst

> @@ -200,6 +200,12 @@ New Features

>

>    Enhanced the GRO library to support TCP packets over IPv6 network.

>

> +* **Added DMA device performance test application.**

> +

> +  Added a new application to test the performance of DMA device and CPU.

> +

> +  See the :doc:`../tools/dmaperf` for more details.

> +

>

>  Removed Items

>  -------------

> diff --git a/doc/guides/tools/dmaperf.rst

> b/doc/guides/tools/dmaperf.rst new file mode 100644 index

> 0000000000..c5f8a9406f

> --- /dev/null

> +++ b/doc/guides/tools/dmaperf.rst

> @@ -0,0 +1,103 @@

> +..  SPDX-License-Identifier: BSD-3-Clause

> +    Copyright(c) 2023 Intel Corporation.

> +

> +dpdk-test-dma-perf Application

> +==============================

> +

> +The ``dpdk-test-dma-perf`` tool is a Data Plane Development Kit

> +(DPDK) application that enables testing the performance of DMA

> +(Direct Memory

> +Access) devices available within DPDK. It provides a test framework

> +to assess the performance of CPU and DMA devices under various

> +scenarios, such as varying buffer lengths. Doing so provides insight

> +into the potential performance when using these DMA devices for

> +acceleration in DPDK applications. It supports memory copy

> +performance tests for now,

> comparing the performance of CPU and DMA automatically in various

> conditions with the help of a pre-set configuration file.

> +

> +

> +Configuration

> +-------------

> +This application uses inherent DPDK EAL command-line options as well

> +as custom command-line options in the application. An example

> +configuration file for the application is provided and gives the

> +meanings for

> each parameter.

> +

> +Here is an extracted sample from the configuration file (the complete

> +sample can be found in the application source directory):

> +

> +.. code-block:: ini

> +

> +   [case1]

> +   type=DMA_MEM_COPY

> +   mem_size=10

> +   buf_size=64,8192,2,MUL

> +   dma_ring_size=1024

> +   kick_batch=32

> +   src_numa_node=0

> +   dst_numa_node=0

> +   cache_flush=0

> +   test_seconds=2

> +   lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3

> +   eal_args=--in-memory --file-prefix=test

> +

> +   [case2]

> +   type=CPU_MEM_COPY

> +   mem_size=10

> +   buf_size=64,8192,2,MUL

> +   src_numa_node=0

> +   dst_numa_node=1

> +   cache_flush=0

> +   test_seconds=2

> +   lcore = 3, 4

> +   eal_args=--in-memory --no-pci

> +

> +The configuration file is divided into multiple sections; each

> +section

> represents a test case.

> +The four variables mem_size, buf_size, dma_ring_size, and kick_batch

> +can

> vary in each test case.

> +The format for this is ``variable=first,last,increment,ADD\|MUL``.

> +This means that the first value of the variable is 'first', the last

> +value is 'last', 'increment' is the step size, and ADD|MUL indicates

> +whether the change is by addition or multiplication. Each case can

> +only have one

> variable change, and each change will generate a scenario, so each

> case can have multiple scenarios.

> +

> +Parameter Definitions

> +---------------------

> +

> +- **type**: The type of the test. Currently supported types are

> `DMA_MEM_COPY` and `CPU_MEM_COPY`.

> +- **mem_size**: The size of the memory footprint.

> +- **buf_size**: The memory size of a single operation.

> +- **dma_ring_size**: The DMA ring buffer size. Must be a power of

> +two,

> and between 64 and 4096.

> +- **kick_batch**: The DMA operation batch size, should be greater

> +than 1

> normally.

> +- **src_numa_node**: Controls the NUMA node where the source memory

> is allocated.

> +- **dst_numa_node**: Controls the NUMA node where the destination

> memory is allocated.

> +- **cache_flush**: Determines whether the cache should be flushed.

> +`1`

> indicates to flush and `0` to not flush.

> +- **test_seconds**: Controls the test time for each scenario.

> +- **lcore_dma**: Specifies the lcore/DMA mapping.

> +- **lcore**: Specifies the lcore for CPU testing.

> +- **eal_args**: Specifies the EAL arguments.

> +

> +.. Note::

> +

> +         The mapping of lcore to DMA must be one-to-one and cannot be

> duplicated.

> +

> +To specify a configuration file, use the "\-\-config" flag followed

> +by the path

> to the file.

> +

> +To specify a result file, use the "\-\-result" flag followed by the

> +path to the file. If you do not specify a result file, one will be

> +generated with the same name as the configuration file, with the

> +addition

> of "_result.csv" at the end.

> +

> +

> +Running the Application

> +-----------------------

> +

> +Typical command-line invocation to execute the application:

> +

> +.. code-block:: console

> +

> +   dpdk-test-dma-perf --config=./config_dma.ini

> + --result=./res_dma.csv

> +

> +Where `config_dma.ini` is the configuration file, and `res_dma.csv`

> +will be the generated result file.

> +

> +After the tests, you can find the results in the `res_dma.csv` file.

> +

> +Limitations

> +-----------

> +

> +Currently, this tool only supports memory copy performance tests.

> +Additional enhancements are possible in the future to support more

> +types

> of tests for DMA devices and CPUs.

> diff --git a/doc/guides/tools/index.rst b/doc/guides/tools/index.rst

> index

> 6f84fc31ff..857572da96 100644

> --- a/doc/guides/tools/index.rst

> +++ b/doc/guides/tools/index.rst

> @@ -23,3 +23,4 @@ DPDK Tools User Guides

>      testregex

>      testmldev

>      dts

> +    dmaperf

> --

> 2.40.1

>

>

>

> End of dev Digest, Vol 462, Issue 27

> ************************************

[-- Attachment #2: Type: text/html, Size: 228096 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2023-07-07  9:56 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-20  7:22 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
2023-05-17  6:16 ` [PATCH v2] " Cheng Jiang
2023-05-17  7:31 ` [PATCH v3] " Cheng Jiang
2023-06-08  5:03 ` [PATCH v4] " Cheng Jiang
2023-06-08  8:27   ` Xia, Chenbo
2023-06-08  8:38     ` Jiang, Cheng1
2023-06-08  8:43 ` [PATCH v5] " Cheng Jiang
2023-06-09 11:44   ` [EXT] " Anoob Joseph
2023-06-12  7:40     ` Jiang, Cheng1
2023-06-09 14:03   ` Amit Prakash Shukla
2023-06-12  8:26     ` Jiang, Cheng1
2023-06-13  4:51       ` Jiang, Cheng1
2023-06-13  7:34         ` Amit Prakash Shukla
2023-06-13  4:31 ` [PATCH v6] " Cheng Jiang
2023-06-13 12:55   ` huangdengdui
2023-06-14  6:40     ` Jiang, Cheng1
2023-06-15  5:21   ` [EXT] " Anoob Joseph
2023-06-15  8:01     ` Jiang, Cheng1
2023-06-15  8:44       ` Anoob Joseph
2023-06-15 14:05         ` Jiang, Cheng1
2023-06-15 15:47           ` Anoob Joseph
2023-06-16  2:56             ` Jiang, Cheng1
2023-06-16  6:32               ` Anoob Joseph
2023-06-16  8:43                 ` Jiang, Cheng1
2023-06-16  9:48                   ` Anoob Joseph
2023-06-16 10:52                     ` Anoob Joseph
2023-06-16 15:15                       ` Jiang, Cheng1
2023-06-17  4:35                         ` Jiang, Cheng1
2023-06-19  5:48                           ` Anoob Joseph
2023-06-19  6:21                             ` Jiang, Cheng1
2023-06-18  5:34                         ` Jiang, Cheng1
2023-06-19  5:25                           ` Anoob Joseph
2023-06-19  6:17                             ` Jiang, Cheng1
2023-06-18 12:26 ` [PATCH v7] " Cheng Jiang
2023-06-20  6:53 ` [PATCH v8] " Cheng Jiang
2023-06-23  6:52   ` [EXT] " Anoob Joseph
2023-06-24 11:52     ` Jiang, Cheng1
2023-06-26  5:41       ` Anoob Joseph
2023-06-26 10:02         ` Jiang, Cheng1
2023-06-26  9:41 ` [PATCH v9] " Cheng Jiang
2023-06-28  1:20 ` [PATCH v10] " Cheng Jiang
2023-06-28  4:42   ` [EXT] " Anoob Joseph
2023-06-28  6:06   ` Ling, WeiX
2023-06-29  9:08   ` Thomas Monjalon
2023-06-29 12:50     ` Jiang, Cheng1
2023-06-29 13:19       ` Thomas Monjalon
2023-06-29 13:24         ` Jiang, Cheng1
2023-06-29  9:38   ` Thomas Monjalon
2023-06-29 12:51     ` Jiang, Cheng1
2023-06-29 13:14 ` [PATCH v11] " Cheng Jiang
2023-07-03  8:20   ` fengchengwen
2023-07-07  9:56     ` Thomas Monjalon
2023-06-28 23:50 [PATCH v10] " Zhang, Yuying

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).