DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] app/dma-perf: introduce dma-perf application
@ 2022-12-20  1:06 Cheng Jiang
  2023-01-17  1:56 ` [PATCH v2] " Cheng Jiang
  2023-01-17 12:05 ` [PATCH v3] " Cheng Jiang
  0 siblings, 2 replies; 15+ messages in thread
From: Cheng Jiang @ 2022-12-20  1:06 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, yvonnex.yang,
	xingguang.he, Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 539 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/benchmark.h |  12 +
 app/test-dma-perf/config.ini  |  61 ++++
 app/test-dma-perf/main.c      | 419 ++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  51 ++++
 app/test-dma-perf/meson.build |  16 +
 7 files changed, 1099 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/benchmark.h
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..a060ad2725 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -28,6 +28,7 @@ apps = [
         'test-regex',
         'test-sad',
         'test-security-perf',
+        'test-dma-perf',
 ]
 
 default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..f6f7cc9ed3
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+
+/* Max completions drained per rte_dma_completed() call. */
+#define MAX_DMA_CPL_NB 255
+
+/* CSV row formats: scenario,lcore,DMA dev (or NA),buf size,nr_buf,memory(MB),
+ * cycles,bandwidth(Gbps),OPS. Must stay in sync with CSV_HDR_FMT in main.c. */
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+
+/* Per-worker arguments handed to dma_mem_copy()/cpu_mem_copy(); allocated
+ * with rte_malloc() by the launcher and freed by the worker itself. */
+struct lcore_params {
+	uint16_t dev_id;	/* dmadev used by this worker (DMA mode only) */
+	uint32_t nr_buf;	/* number of src/dst buffer pairs to copy */
+	uint16_t kick_batch;	/* submit a doorbell every kick_batch copies */
+	uint32_t buf_size;	/* bytes copied per operation */
+	uint32_t repeat_times;	/* full passes over the buffer set */
+	uint16_t mpool_iter_step;	/* stride when walking the buffer arrays */
+	struct rte_mbuf **srcs;
+	struct rte_mbuf **dsts;
+	uint8_t scenario_id;	/* label used in the CSV output */
+};
+
+/* Argument bundle for cache_flush_buf(). */
+struct buf_info {
+	struct rte_mbuf **array;
+	uint32_t nr_buf;
+	uint32_t buf_size;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+/* dmadevs claimed by config_dmadevs(), one per worker. */
+uint16_t dmadev_ids[MAX_WORKER_NB];
+uint32_t nb_dmadevs;
+
+/* Per-lcore CSV staging buffers, defined in main.c. */
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/* fprintf(stderr) helper used by PRINT_ERR: prefixes the message with the
+ * calling function and line number. Returns the total characters written,
+ * mirroring fprintf()'s return convention. */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+/*
+ * Derive the scenario results from the measured cycle count.
+ *
+ * @param p            worker parameters (buf_size/nr_buf used for sizing)
+ * @param cp_cycle_sum total TSC cycles spent on all copies
+ * @param time_sec     wall time of the measurement in seconds
+ * @param repeat_times number of passes over the buffer set
+ * @param memory       out: total src+dst footprint in MB
+ * @param ave_cycle    out: average cycles per single copy
+ * @param bandwidth    out: per-copy bandwidth in Gbps
+ * @param ops          out: copy operations per second
+ */
+static inline void
+calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
+			uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
+			float *bandwidth, uint64_t *ops)
+{
+	*memory = (p->buf_size * p->nr_buf * 2) / (1024 * 1024);
+	/* Fix: use the repeat_times parameter consistently; the original
+	 * divided by p->repeat_times here but used the parameter for *ops,
+	 * which silently disagrees if a caller ever passes a different
+	 * value than what is stored in p. */
+	*ave_cycle = cp_cycle_sum / ((uint64_t)repeat_times * p->nr_buf);
+	*bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
+	*ops = (double)p->nr_buf * repeat_times / time_sec;
+}
+
+/*
+ * Print one scenario's results to stdout and stage the CSV line for this
+ * lcore in output_str[]; is_dma selects the DMA or the CPU line format
+ * (the CPU format carries no DMA device column).
+ *
+ * NOTE(review): output_str is indexed by lcore_id but dimensioned
+ * MAX_WORKER_NB — verify lcore ids cannot exceed that on large machines.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, uint64_t ops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %u:\n"
+				"average cycles: %" PRIu64 ","
+				" buffer size: %u, nr_buf: %u,"
+				" memory: %uMB, frequency: %" PRIu64 ".\n",
+				lcore_id,
+				dev_id,
+				ave_cycle,
+				buf_size,
+				nr_buf,
+				memory,
+				rte_get_timer_hz());
+	else
+		printf("lcore %u\n"
+			"average cycles: %" PRIu64 ","
+			" buffer size: %u, nr_buf: %u,"
+			" memory: %uMB, frequency: %" PRIu64 ".\n",
+			lcore_id,
+			ave_cycle,
+			buf_size,
+			nr_buf,
+			memory,
+			rte_get_timer_hz());
+
+	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
+
+	/* Stage (don't write) the CSV row; output_csv() flushes it later. */
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dev_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+}
+
+static inline void
+cache_flush_buf(void *arg)
+{
+	char *data;
+	char *addr;
+	struct buf_info *info = arg;
+	struct rte_mbuf **srcs = info->array;
+	uint32_t i, k;
+
+	for (i = 0; i < info->nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (k = 0; k < info->buf_size / 64; k++) {
+			addr = (k * 64 + data);
+			__builtin_ia32_clflush(addr);
+		}
+	}
+}
+
+/*
+ * Configure a dmadev with one MEM_TO_MEM vchan of ring_size descriptors
+ * and start it. Any failure is fatal for a benchmark run, so every error
+ * path calls rte_exit() — the original mixed printf()+bare rte_panic()
+ * with rte_exit(), and ignored the rte_dma_info_get() return value.
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE,
+			"Error with queue configuration on device id %u\n", dev_id);
+
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE,
+			"Error with rte_dma_info_get() on device id %u\n", dev_id);
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE,
+			"Error, no configured queues reported on device id %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+
+/*
+ * Claim, configure and start one dmadev per worker, filling dmadev_ids[].
+ * Walks the available devices with rte_dma_next_dev() starting from id 0.
+ * Returns 0 on success, -1 when fewer than nb_workers devices exist.
+ */
+static int
+config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
+{
+	int16_t dev_id = rte_dma_next_dev(0);
+	uint32_t i;
+
+	nb_dmadevs = 0;
+
+	for (i = 0; i < nb_workers; i++) {
+		/* -1 means no more dmadevs are available. */
+		if (dev_id == -1)
+			goto end;
+
+		dmadev_ids[i] = dev_id;
+		configure_dmadev_queue(dmadev_ids[i], ring_size);
+		dev_id = rte_dma_next_dev(dev_id + 1);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+/*
+ * Enqueue nr_buf copies on vchan 0 of dev_id, walking the buffer arrays
+ * with stride mpool_iter_step, kicking the doorbell every kick_batch
+ * enqueues, and draining completions until every copy has finished.
+ */
+static inline void
+do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
+			uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
+{
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t index;
+	uint16_t offset;
+	uint32_t i;
+
+	for (offset = 0; offset < mpool_iter_step; offset++) {
+		/* Comma operator: compute index first, then test it. */
+		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						srcs[index]->buf_iova + srcs[index]->data_off,
+						dsts[index]->buf_iova + dsts[index]->data_off,
+						buf_size,
+						0) < 0)) {
+				/* Ring full: submit what is queued, wait for
+				 * free slots, then retry the enqueue once. */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+				}
+				if (rte_dma_copy(dev_id,
+						0,
+						srcs[index]->buf_iova + srcs[index]->data_off,
+						dsts[index]->buf_iova + dsts[index]->data_off,
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", index);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/* Kick the doorbell every kick_batch enqueues and
+			 * poll once so the ring does not fill up. Note that
+			 * '%' is used here, so kick_batch does NOT have to
+			 * be a power of two (the earlier note about '&'
+			 * masking was stale).
+			 */
+			if (unlikely((async_cnt % kick_batch) == 0)) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+			}
+		}
+
+		/* Flush and drain everything still in flight. */
+		rte_dma_submit(dev_id, 0);
+		while (async_cnt > 0) {
+			nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+			async_cnt -= nr_cpl;
+		}
+	}
+}
+
+/*
+ * Worker entry for the DMA copy test (launched via rte_eal_remote_launch).
+ * Times repeat_times passes of do_dma_mem_copy() with the TSC, computes
+ * and outputs the results, then frees its own lcore_params (allocated
+ * with rte_malloc() by the launcher).
+ */
+static int
+dma_mem_copy(void *p)
+{
+	uint64_t ops;
+	uint32_t memory;
+	float bandwidth;
+	double time_sec;
+	uint32_t lcore_id = rte_lcore_id();
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint16_t kick_batch = params->kick_batch;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t dev_id = params->dev_id;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t r;
+
+	begin = rte_rdtsc();
+
+	for (r = 0; r < repeat_times; r++)
+		do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
+			mpool_iter_step, srcs, dsts);
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+			&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
+			memory, bandwidth, ops, true);
+
+	/* params was rte_malloc()'d by the launcher; the worker frees it. */
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Worker entry for the CPU copy test (launched via rte_eal_remote_launch).
+ * Performs the same buffer walk as the DMA path but with rte_memcpy(),
+ * outputs the results, then frees its own lcore_params.
+ */
+static int
+cpu_mem_copy(void *p)
+{
+	uint32_t idx;
+	uint32_t lcore_id;
+	uint32_t memory;
+	uint64_t ops;
+	float bandwidth;
+	double time_sec;
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t k, j, offset;
+
+	begin = rte_rdtsc();
+
+	for (k = 0; k < repeat_times; k++) {
+		/* copy buffer from src to dst */
+		for (offset = 0; offset < mpool_iter_step; offset++) {
+			for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
+				rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
+					(void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
+					(size_t)buf_size);
+			}
+		}
+	}
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	lcore_id = rte_lcore_id();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+			&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
+			memory, bandwidth, ops, false);
+
+	/* params was rte_malloc()'d by the launcher; the worker frees it. */
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Allocate the source/destination mempools and mbuf pointer arrays for one
+ * scenario and populate every mbuf with buf_size bytes of data space.
+ * Returns 0 on success, -1 on failure; the caller's cleanup path frees
+ * any partially-created state (the pools/arrays are file/function scoped).
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	uint32_t i;
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		/* Fix: "acture" -> "actual" in the user-facing message. */
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/* No cast on malloc() in C. */
+	*srcs = malloc(nr_buf * sizeof(struct rte_mbuf *));
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = malloc(nr_buf * sizeof(struct rte_mbuf *));
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_buf; i++) {
+		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
+		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);
+		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
+			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
+			return -1;
+		}
+
+		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
+		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one DMA copy scenario end to end: allocate mempools and buffer
+ * arrays, claim and start one dmadev per worker, optionally flush the
+ * buffer cache lines, launch a dma_mem_copy() worker per thread, wait
+ * for all of them, then free the buffers and stop the dmadevs.
+ */
+void
+dma_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i;
+	uint32_t offset;
+	unsigned int lcore_id  = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	/* Size nr_buf so src + dst buffers together fill mem_size MB. */
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint32_t repeat_times = cfg->repeat_times;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
+		goto out;
+
+	if (cfg->cache_flush) {
+		struct buf_info info;
+
+		/* Flush src and dst so the DMA path is measured cache-cold. */
+		info.array = srcs;
+		info.buf_size = buf_size;
+		info.nr_buf = nr_buf;
+		cache_flush_buf(&info);
+
+		info.array = dsts;
+		cache_flush_buf(&info);
+		__builtin_ia32_mfence();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		/* Next worker lcore, skipping the main lcore and wrapping. */
+		lcore_id = rte_get_next_lcore(lcore_id, true, true);
+		/* Each worker owns a contiguous slice of the buffer arrays. */
+		offset = nr_buf / nb_workers * i;
+
+		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* Freed by the worker (dma_mem_copy) when it finishes. */
+		*p = (struct lcore_params) {
+			dmadev_ids[i],
+			(uint32_t)(nr_buf/nb_workers),
+			kick_batch,
+			buf_size,
+			repeat_times,
+			mpool_iter_step,
+			srcs + offset,
+			dsts + offset,
+			cfg->scenario_id
+		};
+
+		rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+
+	for (i = 0; i < nb_dmadevs; i++) {
+		printf("Stopping dmadev %d\n", dmadev_ids[i]);
+		rte_dma_stop(dmadev_ids[i]);
+	}
+}
+
+/*
+ * Run one CPU (rte_memcpy) copy scenario: allocate buffers, launch one
+ * cpu_mem_copy() worker per configured thread, wait for completion, then
+ * free all buffers and pools.
+ */
+void
+cpu_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i, offset;
+	uint32_t repeat_times = cfg->repeat_times;
+	uint32_t kick_batch = cfg->kick_batch.cur;
+	uint32_t buf_size = cfg->buf_size.cur;
+	/* Size nr_buf so src + dst buffers together fill mem_size MB. */
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	struct rte_mbuf **srcs  = NULL, **dsts  = NULL;
+	unsigned int lcore_id = 0;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
+		offset = nr_buf / nb_workers * i;
+		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* dev_id 0 is a placeholder; cpu_mem_copy() does not use it.
+		 * p is freed by the worker when it finishes. */
+		*p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
+						buf_size, repeat_times, mpool_iter_step,
+						srcs + offset, dsts + offset, cfg->scenario_id };
+		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+}
diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
new file mode 100644
index 0000000000..f5ad8d6d99
--- /dev/null
+++ b/app/test-dma-perf/benchmark.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _BENCHMARK_H_
+#define _BENCHMARK_H_
+
+/* Run one DMA memory-copy scenario described by cfg (see benchmark.c). */
+void dma_mem_copy_benchmark(struct test_configure *cfg);
+
+/* Run one CPU (rte_memcpy) memory-copy scenario described by cfg. */
+void cpu_mem_copy_benchmark(struct test_configure *cfg);
+
+#endif /* _BENCHMARK_H_ */
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..e24bb19414
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; Supported test types:
+; DMA_MEM_COPY|CPU_MEM_COPY
+
+; Parameters:
+; "mem_size","buf_size","dma_ring_size","kick_batch".
+; "mem_size" means the size of the memory footprint.
+; "buf_size" means the memory size of a single operation.
+; "dma_ring_size" means the dma ring buffer size.
+; "kick_batch" means dma operation batch size.
+
+; Format: variable=first[,last,increment[,ADD|MUL]]
+; ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to control if the cache should be flushed.
+
+; repeat_times is used to control the repeat times of the whole case.
+
+; worker_threads is used to control the number of worker threads used by the test app.
+; It should be less than the number of available cores.
+
+; mpool_iter_step is used to control the stride between successive buffers taken
+; from the mempool, i.e. the memory continuity of the copies.
+
+; Bind DMA to lcore:
+; Specify the "lcore_dma" parameter.
+; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
+; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
+; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
+; their range.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+repeat_times=10
+worker_threads=1
+mpool_iter_step=1
+lcore_dma=lcore3@0000:00:04.0
+eal_args=--legacy-mem --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+repeat_times=100
+worker_threads=1
+mpool_iter_step=1
+eal_args=--no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..6a67c0ff83
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+/* CSV header row; columns must match CSV_LINE_*_FMT in benchmark.c. */
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),OPS\n"
+
+/* Upper bounds for the EAL argv rebuilt from the config file. */
+#define MAX_EAL_PARM_NB 100
+#define MAX_EAL_PARM_LEN 1024
+
+/* Case type strings accepted in config.ini. */
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+/* first[,last,incr[,ADD|MUL]] -> at most 4 comma-separated fields. */
+#define MAX_PARAMS_PER_ENTRY 4
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+/* Per-lcore CSV staging buffers, flushed by output_csv(). */
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* Result CSV file handle; reopened per child process in main(). */
+static FILE *fd;
+
+/* Flush every staged per-lcore line to the CSV file and clear the staging
+ * buffers; optionally emit two empty separator rows first. */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t idx;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (idx = 0; idx < RTE_DIM(output_str); idx++) {
+		if (output_str[idx][0] == '\0')
+			continue;
+		fprintf(fd, "%s", output_str[idx]);
+		memset(output_str[idx], 0, MAX_OUTPUT_STR_LEN);
+	}
+
+	fflush(fd);
+}
+
+/* Stage and flush two CSV lines describing the test environment
+ * (currently just the TSC frequency) at the top of the result file. */
+static void
+output_env_info(void)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+/* Emit the per-case CSV header row (see CSV_HDR_FMT). */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
+			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/* Dispatch one test case to the benchmark matching its configured type. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY)
+		dma_mem_copy_benchmark(case_cfg);
+	else if (type == TEST_TYPE_CPU_MEM_COPY)
+		cpu_mem_copy_benchmark(case_cfg);
+	else
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run one test case. If one of mem_size/buf_size/ring_size/kick_batch is
+ * a range entry (incr != 0), sweep it scenario by scenario; otherwise run
+ * a single scenario.
+ *
+ * NOTE(review): when several entries have incr set, only the LAST one
+ * checked below (kick_batch) becomes var_entry — load_configs() is
+ * expected to reject configurations with more than one range.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry *var_entry = NULL;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* One lcore stays the main lcore; workers need the rest. */
+	if (nb_lcores <= case_cfg->nb_workers) {
+		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
+			case_id, nb_lcores, case_cfg->nb_workers);
+		return;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	if (var_entry) {
+		/* Step var_entry->cur from first to last by incr (ADD/MUL). */
+		for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+			case_cfg->scenario_id++;
+			printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+			run_test_case(case_cfg);
+			output_csv(false);
+
+			if (var_entry->op == OP_MUL)
+				var_entry->cur *= var_entry->incr;
+			else
+				var_entry->cur += var_entry->incr;
+
+
+		}
+	} else {
+		run_test_case(case_cfg);
+		output_csv(false);
+	}
+}
+
+/*
+ * Parse a config entry of the form "first[,last,increment[,ADD|MUL]]"
+ * into @entry. Returns the number of comma-separated fields parsed, or
+ * a negative value on missing/empty input.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr = -1;
+
+	/* Fix: rte_cfgfile_get_entry() returns NULL for a missing key and
+	 * the original passed that straight into strncpy(), crashing. */
+	if (value == NULL)
+		goto out;
+
+	strncpy(input, value, sizeof(input) - 1);
+	if (*input == '\0')
+		goto out;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	/* ADD is the default stepping mode when a 4th field is present. */
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else
+			entry->op = OP_ADD;
+	} else
+		entry->op = OP_NONE;
+out:
+	return args_nr;
+}
+
+/*
+ * Load every [case] section from ./config.ini into test_cases[].
+ * Exits the process on a missing/unreadable file or an unknown case type;
+ * stops parsing (break) on a malformed entry.
+ */
+static void
+load_configs(void)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char **sections_name;
+	const char *section_name, *case_type;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+
+	sections_name = malloc(MAX_TEST_CASES * sizeof(char *));
+	if (sections_name == NULL) {
+		printf("Error: section name array allocation failed.\n");
+		exit(1);
+	}
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		/* Fix: one buffer of CFG_NAME_LEN chars per name; the
+		 * original multiplied by sizeof(char *), over-allocating. */
+		sections_name[i] = malloc(CFG_NAME_LEN * sizeof(char));
+		if (sections_name[i] == NULL) {
+			printf("Error: section name allocation failed.\n");
+			exit(1);
+		}
+	}
+
+	cfgfile = rte_cfgfile_load("./config.ini", 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+	rte_cfgfile_sections(cfgfile, sections_name, MAX_TEST_CASES);
+	for (i = 0; i < nb_sections; i++) {
+		test_case = &test_cases[i];
+		section_name = sections_name[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			/* Fix: misplaced period was after the newline. */
+			printf("Error: No case type in case %d.\n", i + 1);
+			exit(1);
+		}
+		if (!strcmp(case_type, DMA_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+		} else if (!strcmp(case_type, CPU_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+		} else {
+			printf("Error: Cannot find case type %s.\n", case_type);
+			exit(1);
+		}
+
+		nb_vp = 0;
+
+		/* NOTE(review): rte_cfgfile_get_entry() returns NULL for a
+		 * missing key; the atoi() calls below would crash on an
+		 * incomplete section — confirm all keys are mandatory. */
+		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "src_numa_node"));
+		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dst_numa_node"));
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "dma_ring_size");
+		args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+		args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		/* Fix: run_test() can sweep only a single variable entry, so
+		 * reject more than one; the original checked nb_vp > 2 and
+		 * silently accepted two ranges while ignoring one of them. */
+		if (nb_vp > 1) {
+			printf("%s, variable parameters can only have one.\n", section_name);
+			break;
+		}
+
+		test_case->cache_flush =
+			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name, "cache_flush"));
+		test_case->repeat_times =
+			(uint32_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "repeat_times"));
+		test_case->nb_workers =
+			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "worker_threads"));
+		test_case->mpool_iter_step =
+			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "mpool_iter_step"));
+
+		test_case->eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	}
+
+	rte_cfgfile_close(cfgfile);
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (sections_name[i] != NULL)
+			free(sections_name[i]);
+	}
+	free(sections_name);
+}
+
+/*
+ * Build a new argv by copying the application's argv and appending the
+ * space-separated tokens from the case's "eal_args" config entry.
+ * new_argv must provide MAX_EAL_PARM_NB entries of MAX_EAL_PARM_LEN bytes
+ * each (main() pre-wires them). Returns the new argument count.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARM_NB];
+	char args[MAX_EAL_PARM_LEN] = {0};
+	int new_argc, token_nb;
+
+	new_argc = 0;
+
+	/* Fix: bound every copy; the original used unchecked strcpy() and
+	 * never capped new_argc, so an oversized command line or eal_args
+	 * entry could overflow the fixed-size destination buffers. */
+	for (i = 0; i < argc && new_argc < MAX_EAL_PARM_NB; i++) {
+		snprintf(new_argv[new_argc], MAX_EAL_PARM_LEN, "%s", argv[i]);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		snprintf(args, sizeof(args), "%s", eal_args);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARM_NB, ' ');
+		for (i = 0; i < token_nb && new_argc < MAX_EAL_PARM_NB; i++) {
+			snprintf(new_argv[new_argc], MAX_EAL_PARM_LEN, "%s", tokens[i]);
+			new_argc++;
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Load the test cases, truncate the result CSV, then run each configured
+ * case in its own forked child so every case gets a fresh EAL instance
+ * (EAL cannot be re-initialized within a single process). The parent
+ * waits for each child and reports how it terminated.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARM_NB][MAX_EAL_PARM_LEN];
+	char *pargs[MAX_EAL_PARM_NB];
+	int new_argc;
+
+	/* Pre-wire pargs so append_eal_args() can fill the flat buffers.
+	 * (Fix: use MAX_EAL_PARM_NB instead of the magic constant 100.) */
+	memset(args, 0, sizeof(args));
+	for (i = 0; i < MAX_EAL_PARM_NB; i++)
+		pargs[i] = args[i];
+
+	load_configs();
+
+	/* Truncate any stale result file; children re-open it in append
+	 * mode below. */
+	fd = fopen("./test_result.csv", "w");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 0;
+	}
+	fclose(fd);
+
+	/* loop each case, run it */
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE)
+			continue;
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %u failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n", i + 1);
+
+			if (test_cases[i].eal_args) {
+				new_argc = append_eal_args(argc, argv,
+					test_cases[i].eal_args, pargs);
+
+				ret = rte_eal_init(new_argc, pargs);
+			} else {
+				ret = rte_eal_init(argc, argv);
+			}
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores: need the main lcore plus workers. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen("./test_result.csv", "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+
+			if (i == 0)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			/* Parent: wait for the child and report its end. */
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n");
+			else
+				printf("Case process unknown terminated.\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..f43b0c5d31
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+
+/* Upper bound on worker lcores and staged CSV lines. */
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
+
+/* How a variable config entry steps from 'first' to 'last'. */
+typedef enum {
+	OP_NONE = 0,
+	OP_ADD,
+	OP_MUL
+} alg_op_type;
+
+/* One config value: either a fixed number (incr == 0) or a range that
+ * run_test() sweeps from 'first' to 'last' by 'incr' using 'op'. */
+struct test_configure_entry {
+	uint32_t first;
+	uint32_t last;
+	uint32_t incr;
+	alg_op_type op;
+	uint32_t cur;	/* value used by the current scenario */
+};
+
+/* One [caseN] section parsed from config.ini. */
+struct test_configure {
+	uint8_t test_type;	/* TEST_TYPE_* */
+	const char *test_type_str;
+	uint16_t src_numa_node;
+	uint16_t dst_numa_node;
+	uint16_t opcode;	/* NOTE(review): never assigned in this patch */
+	bool is_dma;	/* NOTE(review): never assigned in this patch */
+	struct test_configure_entry mem_size;	/* total footprint in MB */
+	struct test_configure_entry buf_size;	/* bytes per copy */
+	struct test_configure_entry ring_size;	/* dmadev descriptor ring */
+	struct test_configure_entry kick_batch;	/* copies per doorbell */
+	uint32_t cache_flush;	/* flush caches before measuring */
+	uint32_t nr_buf;	/* derived: buffers per side */
+	uint32_t repeat_times;
+	uint32_t nb_workers;
+	uint16_t mpool_iter_step;	/* buffer walk stride */
+	const char *eal_args;	/* extra EAL args appended per case */
+	uint8_t scenario_id;
+};
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..931df6ed54
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2022 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+#
+# Note: unlike the examples, this application is built only as part of the
+# main DPDK build; no standalone Makefile is provided with this patch.
+
+allow_experimental_apis = true
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
-- 
2.35.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v2] app/dma-perf: introduce dma-perf application
  2022-12-20  1:06 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
@ 2023-01-17  1:56 ` Cheng Jiang
  2023-01-17 13:00   ` Bruce Richardson
  2023-01-17 12:05 ` [PATCH v3] " Cheng Jiang
  1 sibling, 1 reply; 15+ messages in thread
From: Cheng Jiang @ 2023-01-17  1:56 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
v2: fixed some CI issues.

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 539 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/benchmark.h |  12 +
 app/test-dma-perf/config.ini  |  61 ++++
 app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  53 ++++
 app/test-dma-perf/meson.build |  22 ++
 7 files changed, 1122 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/benchmark.h
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..a060ad2725 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -28,6 +28,7 @@ apps = [
         'test-regex',
         'test-sad',
         'test-security-perf',
+        'test-dma-perf',
 ]

 default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..1cb5b0b291
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+
+#define MAX_DMA_CPL_NB 255
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+
+struct lcore_params {
+	uint16_t dev_id;
+	uint32_t nr_buf;
+	uint16_t kick_batch;
+	uint32_t buf_size;
+	uint32_t repeat_times;
+	uint16_t mpool_iter_step;
+	struct rte_mbuf **srcs;
+	struct rte_mbuf **dsts;
+	uint8_t scenario_id;
+};
+
+struct buf_info {
+	struct rte_mbuf **array;
+	uint32_t nr_buf;
+	uint32_t buf_size;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+uint16_t dmadev_ids[MAX_WORKER_NB];
+uint32_t nb_dmadevs;
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+/*
+ * Derive the reported metrics from the measured cycle count.
+ *
+ * @param p            worker parameters (buffer size/count are read from here)
+ * @param cp_cycle_sum total TSC cycles spent copying
+ * @param time_sec     wall-clock duration of the test, in seconds
+ * @param repeat_times number of passes over the buffer set
+ * @param memory       out: total memory touched (src + dst), in MB
+ * @param ave_cycle    out: average cycles per single buffer copy
+ * @param bandwidth    out: per-buffer copy bandwidth, in Gbps
+ * @param ops          out: buffer copies per second
+ */
+static inline void
+calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
+			uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
+			float *bandwidth, uint64_t *ops)
+{
+	/* Widen before multiplying: buf_size * nr_buf * 2 can overflow 32 bits. */
+	*memory = (uint32_t)(((uint64_t)p->buf_size * p->nr_buf * 2) / (1024 * 1024));
+	/* Use the repeat_times argument consistently (callers pass p->repeat_times). */
+	*ave_cycle = cp_cycle_sum / ((uint64_t)repeat_times * p->nr_buf);
+	*bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
+	*ops = (double)p->nr_buf * repeat_times / time_sec;
+}
+
+/*
+ * Print one scenario's results to stdout and queue the matching CSV line
+ * in this lcore's output_str slot (flushed later by output_csv() in main.c).
+ * dev_id is only meaningful when is_dma is true.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, uint64_t ops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %u:\n"
+				"average cycles: %" PRIu64 ","
+				" buffer size: %u, nr_buf: %u,"
+				" memory: %uMB, frequency: %" PRIu64 ".\n",
+				lcore_id,
+				dev_id,
+				ave_cycle,
+				buf_size,
+				nr_buf,
+				memory,
+				rte_get_timer_hz());
+	else
+		printf("lcore %u\n"
+			"average cycles: %" PRIu64 ","
+			" buffer size: %u, nr_buf: %u,"
+			" memory: %uMB, frequency: %" PRIu64 ".\n",
+			lcore_id,
+			ave_cycle,
+			buf_size,
+			nr_buf,
+			memory,
+			rte_get_timer_hz());
+
+	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
+
+	/* One CSV line per lcore; the DMA format carries the device id. */
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dev_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+}
+
+/*
+ * Flush each buffer from the CPU caches, one 64-byte cache line at a time
+ * (x86-64 only; a no-op on other architectures).  The loop count is
+ * buf_size / 64, so a tail smaller than one cache line is not flushed --
+ * assumes buf_size is a multiple of 64; TODO confirm for odd sizes.
+ * No fence is issued here; the caller follows up with rte_mb().
+ */
+static inline void
+cache_flush_buf(void *arg)
+{
+#ifdef RTE_ARCH_X86_64
+	char *data;
+	char *addr;
+	struct buf_info *info = arg;
+	struct rte_mbuf **srcs = info->array;
+	uint32_t i, k;
+
+	for (i = 0; i < info->nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (k = 0; k < info->buf_size / 64; k++) {
+			addr = (k * 64 + data);
+			__builtin_ia32_clflush(addr);
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a dmadev with one MEM_TO_MEM vchan of the given ring size and
+ * start it.  Exits the process on any failure.
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+	/* Use rte_exit() consistently instead of mixing printf() + rte_panic(). */
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration\n");
+
+	rte_dma_info_get(dev_id, &info);
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE,
+			"Error, no configured queues reported on device id %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+
+/*
+ * Claim the first nb_workers available dmadevs, configure and start each
+ * with the given ring size, and record their ids in dmadev_ids[].
+ * Returns 0 on success, -1 if fewer than nb_workers devices were found
+ * (devices already started are left running; main() stops them later).
+ */
+static int
+config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
+{
+	int16_t dev_id = rte_dma_next_dev(0);
+	uint32_t i;
+
+	nb_dmadevs = 0;
+
+	for (i = 0; i < nb_workers; i++) {
+		if (dev_id == -1)
+			goto end;
+
+		dmadev_ids[i] = dev_id;
+		configure_dmadev_queue(dmadev_ids[i], ring_size);
+		dev_id = rte_dma_next_dev(dev_id + 1);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+/*
+ * Enqueue one full pass of nr_buf copies on vchan 0 of dev_id and wait
+ * until all of them complete.  Buffers are visited with a stride of
+ * mpool_iter_step to control buffer continuity.  If the ring is full,
+ * completions are drained until space frees up and the enqueue is retried
+ * once; a second failure aborts the process.
+ */
+static inline void
+do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
+			uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
+{
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t index;
+	uint16_t offset;
+	uint32_t i;
+
+	for (offset = 0; offset < mpool_iter_step; offset++) {
+		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {
+			/* Ring full: kick pending copies, drain completions, retry once. */
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						srcs[index]->buf_iova + srcs[index]->data_off,
+						dsts[index]->buf_iova + dsts[index]->data_off,
+						buf_size,
+						0) < 0)) {
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+				}
+				if (rte_dma_copy(dev_id,
+						0,
+						srcs[index]->buf_iova + srcs[index]->data_off,
+						dsts[index]->buf_iova + dsts[index]->data_off,
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", index);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/*
+			 * Kick the device every kick_batch enqueues.  Note
+			 * that '%' is used here (not '&' masking), so
+			 * kick_batch does not have to be a power of two.
+			 */
+			if (unlikely((async_cnt % kick_batch) == 0)) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+			}
+		}
+
+		/* Final kick for this stride, then wait for all outstanding copies. */
+		rte_dma_submit(dev_id, 0);
+		while (async_cnt > 0) {
+			nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+			async_cnt -= nr_cpl;
+		}
+	}
+}
+
+/*
+ * Worker entry point for the DMA copy test.  Runs repeat_times passes of
+ * do_dma_mem_copy() on its assigned dmadev, measures elapsed TSC cycles,
+ * and reports results via calc_result()/output_result().  Takes ownership
+ * of its lcore_params (p) and frees it before returning.
+ */
+static int
+dma_mem_copy(void *p)
+{
+	uint64_t ops;
+	uint32_t memory;
+	float bandwidth;
+	double time_sec;
+	uint32_t lcore_id = rte_lcore_id();
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint16_t kick_batch = params->kick_batch;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t dev_id = params->dev_id;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t r;
+
+	begin = rte_rdtsc();
+
+	for (r = 0; r < repeat_times; r++)
+		do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
+			mpool_iter_step, srcs, dsts);
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+			&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
+			memory, bandwidth, ops, true);
+
+	/* params was rte_malloc()'d by the launcher; this worker frees it. */
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Worker entry point for the CPU copy test.  Copies every buffer with
+ * rte_memcpy() repeat_times times, measures elapsed TSC cycles, and
+ * reports results via calc_result()/output_result().  Takes ownership of
+ * its lcore_params (p) and frees it before returning.
+ */
+static int
+cpu_mem_copy(void *p)
+{
+	uint32_t idx;
+	uint32_t lcore_id;
+	uint32_t memory;
+	uint64_t ops;
+	float bandwidth;
+	double time_sec;
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t k, j, offset;
+
+	begin = rte_rdtsc();
+
+	for (k = 0; k < repeat_times; k++) {
+		/* copy buffer from src to dst, same stride as the DMA test */
+		for (offset = 0; offset < mpool_iter_step; offset++) {
+			for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
+				rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
+					(void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
+					(size_t)buf_size);
+			}
+		}
+	}
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	lcore_id = rte_lcore_id();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+			&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
+			memory, bandwidth, ops, false);
+
+	/* params was rte_malloc()'d by the launcher; this worker frees it. */
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Create the source/destination mempools and allocate cfg->nr_buf mbufs
+ * from each into the caller-provided arrays.  Returns 0 on success, -1 on
+ * failure; the caller frees whatever was allocated (its cleanup walks all
+ * nr_buf entries, which is why the arrays below must be zero-initialized).
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	uint32_t i;
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/*
+	 * calloc() instead of malloc(): on an early return below, the
+	 * caller's cleanup frees all nr_buf slots, so the unfilled ones
+	 * must be NULL rather than uninitialized memory.
+	 */
+	*srcs = calloc(nr_buf, sizeof(struct rte_mbuf *));
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = calloc(nr_buf, sizeof(struct rte_mbuf *));
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_buf; i++) {
+		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
+		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);
+		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
+			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
+			return -1;
+		}
+
+		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
+		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Run the DMA memory-copy benchmark for one scenario: set up mempools and
+ * buffers, configure one dmadev per worker, optionally flush the buffers
+ * from cache, launch dma_mem_copy() on nb_workers lcores, wait for them,
+ * then release all resources.
+ */
+void
+dma_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i;
+	uint32_t offset;
+	unsigned int lcore_id  = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	/* nr_buf is derived so that src + dst buffers fill mem_size MB. */
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint32_t repeat_times = cfg->repeat_times;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
+		goto out;
+
+	if (cfg->cache_flush) {
+		struct buf_info info;
+
+		info.array = srcs;
+		info.buf_size = buf_size;
+		info.nr_buf = nr_buf;
+		cache_flush_buf(&info);
+
+		info.array = dsts;
+		cache_flush_buf(&info);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/* Each worker gets its own dmadev and an nr_buf/nb_workers slice. */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = rte_get_next_lcore(lcore_id, true, true);
+		offset = nr_buf / nb_workers * i;
+
+		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* Ownership of p passes to the worker, which frees it. */
+		*p = (struct lcore_params) {
+			dmadev_ids[i],
+			(uint32_t)(nr_buf/nb_workers),
+			kick_batch,
+			buf_size,
+			repeat_times,
+			mpool_iter_step,
+			srcs + offset,
+			dsts + offset,
+			cfg->scenario_id
+		};
+
+		rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	/* NOTE(review): srcs/dsts come from malloc() in setup_memory_env();
+	 * on an early setup failure some entries may be uninitialized when
+	 * freed here -- verify.
+	 */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+
+	for (i = 0; i < nb_dmadevs; i++) {
+		printf("Stopping dmadev %d\n", dmadev_ids[i]);
+		rte_dma_stop(dmadev_ids[i]);
+	}
+}
+
+/*
+ * Run the CPU memory-copy benchmark for one scenario: set up mempools and
+ * buffers, launch cpu_mem_copy() on nb_workers lcores, wait for them, then
+ * release all resources.  kick_batch is forwarded in lcore_params but is
+ * not read by cpu_mem_copy().
+ */
+void
+cpu_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i, offset;
+	uint32_t repeat_times = cfg->repeat_times;
+	uint32_t kick_batch = cfg->kick_batch.cur;
+	uint32_t buf_size = cfg->buf_size.cur;
+	/* nr_buf is derived so that src + dst buffers fill mem_size MB. */
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	struct rte_mbuf **srcs  = NULL, **dsts  = NULL;
+	unsigned int lcore_id = 0;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	for (i = 0; i < nb_workers; i++) {
+		/* NOTE(review): skip_main differs from the DMA variant
+		 * ((lcore_id, true, true)) -- confirm this is intentional.
+		 */
+		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
+		offset = nr_buf / nb_workers * i;
+		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* dev_id 0 is a placeholder; CPU workers do not use a dmadev. */
+		*p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
+						buf_size, repeat_times, mpool_iter_step,
+						srcs + offset, dsts + offset, cfg->scenario_id };
+		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+}
diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
new file mode 100644
index 0000000000..f5ad8d6d99
--- /dev/null
+++ b/app/test-dma-perf/benchmark.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _BENCHMARK_H_
+#define _BENCHMARK_H_
+
+void dma_mem_copy_benchmark(struct test_configure *cfg);
+
+void cpu_mem_copy_benchmark(struct test_configure *cfg);
+
+#endif /* _BENCHMARK_H_ */
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..e24bb19414
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; Supported test types:
+; DMA_MEM_COPY|CPU_MEM_COPY
+
+; Parameters:
+; "mem_size","buf_size","dma_ring_size","kick_batch".
+; "mem_size" means the size of the memory footprint.
+; "buf_size" means the memory size of a single operation.
+; "dma_ring_size" means the dma ring buffer size.
+; "kick_batch" means dma operation batch size.
+
+; Format: variable=first[,last,increment[,ADD|MUL]]
+; ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to control if the cache should be flushed.
+
+; repeat_times is used to control the repeat times of the whole case.
+
+; worker_threads is used to control the threads number of the test app.
+; It should be less than the core number.
+
+; mpool_iter_step is used to control the buffer continuity.
+
+; Bind DMA to lcore:
+; Specify the "lcore_dma" parameter.
+; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
+; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
+; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
+; their range.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+repeat_times=10
+worker_threads=1
+mpool_iter_step=1
+lcore_dma=lcore3@0000:00:04.0
+eal_args=--legacy-mem --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+repeat_times=100
+worker_threads=1
+mpool_iter_step=1
+eal_args=--no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..94ba369539
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,434 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdio.h>
+#if !defined(RTE_EXEC_ENV_LINUX)
+
+int
+main(int argc, char *argv[])
+{
+	printf("OS not supported, skipping test\n");
+	return 0;
+}
+
+#else
+
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),OPS\n"
+
+#define MAX_EAL_PARM_NB 100
+#define MAX_EAL_PARM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every pending per-lcore result string to the CSV file and clear
+ * it; optionally emit two empty CSV rows first as a section separator.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t idx;
+
+	if (need_blankline) {
+		fprintf(fd, "%s", ",,,,,,,,\n");
+		fprintf(fd, "%s", ",,,,,,,,\n");
+	}
+
+	for (idx = 0; idx < RTE_DIM(output_str); idx++) {
+		if (output_str[idx][0] == '\0')
+			continue;
+		fprintf(fd, "%s", output_str[idx]);
+		memset(output_str[idx], 0, MAX_OUTPUT_STR_LEN);
+	}
+
+	fflush(fd);
+}
+
+/* Write a "test environment" section (TSC frequency) to the CSV file. */
+static void
+output_env_info(void)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+/* Write one case's CSV column-header row, preceded by blank separator rows. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
+			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/* Dispatch a single scenario run to the benchmark matching its test type. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	if (case_cfg->test_type == TEST_TYPE_DMA_MEM_COPY)
+		dma_mem_copy_benchmark(case_cfg);
+	else if (case_cfg->test_type == TEST_TYPE_CPU_MEM_COPY)
+		cpu_mem_copy_benchmark(case_cfg);
+	else
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run one test case.  At most one of mem_size/buf_size/ring_size/
+ * kick_batch is expected to be a variable entry (incr != 0); if several
+ * are, the last one checked below wins.  The variable entry is swept from
+ * first to last and each value is run as a separate scenario.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry *var_entry = NULL;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* Workers plus the main lcore must all fit. */
+	if (nb_lcores <= case_cfg->nb_workers) {
+		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
+			case_id, nb_lcores, case_cfg->nb_workers);
+		return;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	if (var_entry) {
+		/* Sweep the variable entry; each value is one scenario. */
+		for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+			case_cfg->scenario_id++;
+			printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+			run_test_case(case_cfg);
+			output_csv(false);
+
+			if (var_entry->op == OP_MUL)
+				var_entry->cur *= var_entry->incr;
+			else
+				var_entry->cur += var_entry->incr;
+
+
+		}
+	} else {
+		run_test_case(case_cfg);
+		output_csv(false);
+	}
+}
+
+/*
+ * Parse a config value of the form "first[,last,incr[,ADD|MUL]]" into
+ * *entry.  Returns the number of comma-separated fields parsed (>= 1),
+ * or -1 if value is NULL, empty or unsplittable (callers treat < 0 as a
+ * parse error).
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr = -1;
+
+	/* Guard against a missing key: rte_cfgfile_get_entry() returns NULL. */
+	if (value == NULL)
+		goto out;
+
+	strncpy(input, value, 254);
+	if (*input == '\0')
+		goto out;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else
+			entry->op = OP_ADD;
+	} else
+		entry->op = OP_NONE;
+out:
+	return args_nr;
+}
+
+/* Return the integer value of a section key, or def if the key is absent. */
+static int
+get_cfg_int(struct rte_cfgfile *cfgfile, const char *section, const char *key, int def)
+{
+	const char *entry = rte_cfgfile_get_entry(cfgfile, section, key);
+
+	return entry ? atoi(entry) : def;
+}
+
+/*
+ * Load all test cases from ./config.ini into the global test_cases array.
+ * Exits the process if the file is missing or holds too many sections;
+ * a malformed case stops the parsing loop early.
+ */
+static void
+load_configs(void)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char **sections_name;
+	const char *section_name, *case_type;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+
+	sections_name = malloc(MAX_TEST_CASES * sizeof(char *));
+	if (sections_name == NULL) {
+		printf("Error: section name array allocation failed.\n");
+		exit(1);
+	}
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		/* One name buffer of CFG_NAME_LEN chars each (the previous
+		 * sizeof(char *) multiplier over-allocated by 8x).
+		 */
+		sections_name[i] = malloc(CFG_NAME_LEN);
+		if (sections_name[i] == NULL) {
+			printf("Error: section name allocation failed.\n");
+			exit(1);
+		}
+	}
+
+	cfgfile = rte_cfgfile_load("./config.ini", 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+	rte_cfgfile_sections(cfgfile, sections_name, MAX_TEST_CASES);
+	for (i = 0; i < nb_sections; i++) {
+		test_case = &test_cases[i];
+		section_name = sections_name[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d\n.", i + 1);
+			exit(1);
+		}
+		if (!strcmp(case_type, DMA_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+		} else if (!strcmp(case_type, CPU_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+		} else {
+			printf("Error: Cannot find case type %s.\n", case_type);
+			exit(1);
+		}
+
+		nb_vp = 0;
+
+		/* get_cfg_int() guards against atoi(NULL) on missing keys. */
+		test_case->src_numa_node = (uint16_t)get_cfg_int(cfgfile,
+								section_name, "src_numa_node", 0);
+		test_case->dst_numa_node = (uint16_t)get_cfg_int(cfgfile,
+								section_name, "dst_numa_node", 0);
+
+		/* Missing entries parse as -1 and are reported below. */
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = mem_size_str ? parse_entry(mem_size_str, &test_case->mem_size) : -1;
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = buf_size_str ? parse_entry(buf_size_str, &test_case->buf_size) : -1;
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "dma_ring_size");
+		args_nr = ring_size_str ? parse_entry(ring_size_str, &test_case->ring_size) : -1;
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+		args_nr = kick_batch_str ? parse_entry(kick_batch_str, &test_case->kick_batch) : -1;
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		/* Only one variable parameter is allowed; the previous check
+		 * (nb_vp > 2) wrongly accepted two of them.
+		 */
+		if (nb_vp > 1) {
+			printf("%s, variable parameters can only have one.\n", section_name);
+			break;
+		}
+
+		test_case->cache_flush =
+			(uint32_t)get_cfg_int(cfgfile, section_name, "cache_flush", 0);
+		test_case->repeat_times =
+			(uint32_t)get_cfg_int(cfgfile, section_name, "repeat_times", 0);
+		test_case->nb_workers =
+			(uint16_t)get_cfg_int(cfgfile, section_name, "worker_threads", 0);
+		test_case->mpool_iter_step =
+			(uint16_t)get_cfg_int(cfgfile, section_name, "mpool_iter_step", 0);
+
+		test_case->eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	}
+
+	rte_cfgfile_close(cfgfile);
+	for (i = 0; i < MAX_TEST_CASES; i++)
+		free(sections_name[i]);	/* free(NULL) is a no-op */
+	free(sections_name);
+}
+
+/*
+ * Build a new argv for rte_eal_init() by copying the original argc/argv
+ * into the caller's buffers and appending the space-separated tokens of
+ * eal_args (from the config file).  Returns the new argc.
+ * NOTE(review): the strcpy() calls assume every argument fits within
+ * MAX_EAL_PARM_LEN and that argc plus the token count stays below
+ * MAX_EAL_PARM_NB -- verify for long command lines.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARM_NB];
+	char args[MAX_EAL_PARM_LEN] = {0};
+	int new_argc, token_nb;
+
+	new_argc = argc;
+
+	for (i = 0; i < argc; i++)
+		strcpy(new_argv[i], argv[i]);
+
+	if (eal_args) {
+		strcpy(args, eal_args);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARM_NB, ' ');
+		for (i = 0; i < token_nb; i++)
+			strcpy(new_argv[new_argc++], tokens[i]);
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point: load the config file, then run each configured case in a
+ * forked child so every case gets a fresh EAL (rte_eal_init() may only
+ * run once per process).  Results are appended to ./test_result.csv.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARM_NB][MAX_EAL_PARM_LEN];
+	char *pargs[MAX_EAL_PARM_NB];	/* was a magic "100"; keep in sync with args[] */
+	int new_argc;
+
+	memset(args, 0, sizeof(args));
+	for (i = 0; i < MAX_EAL_PARM_NB; i++)
+		pargs[i] = args[i];
+
+	load_configs();
+	/* Truncate any stale results file; children append to it below. */
+	fd = fopen("./test_result.csv", "w");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 0;
+	}
+	fclose(fd);
+
+	/* loop each case, run it */
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (test_cases[i].test_type != TEST_TYPE_NONE) {
+			cpid = fork();
+			if (cpid < 0) {
+				printf("Fork case %d failed.\n", i + 1);
+				exit(EXIT_FAILURE);
+			} else if (cpid == 0) {
+				printf("\nRunning case %u\n", i + 1);
+
+				/* Per-case eal_args from the config file are
+				 * appended to the command-line arguments.
+				 */
+				if (test_cases[i].eal_args) {
+					new_argc = append_eal_args(argc, argv,
+						test_cases[i].eal_args, pargs);
+
+					ret = rte_eal_init(new_argc, pargs);
+				} else {
+					ret = rte_eal_init(argc, argv);
+				}
+				if (ret < 0)
+					rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+				/* Check lcores. */
+				nb_lcores = rte_lcore_count();
+				if (nb_lcores < 2)
+					rte_exit(EXIT_FAILURE,
+						"There should be at least 2 worker lcores.\n");
+
+				fd = fopen("./test_result.csv", "a");
+				if (!fd) {
+					printf("Open output CSV file error.\n");
+					return 0;
+				}
+
+				if (i == 0)
+					output_env_info();
+				run_test(i + 1, &test_cases[i]);
+
+				/* clean up the EAL */
+				rte_eal_cleanup();
+
+				fclose(fd);
+
+				printf("\nCase %u completed.\n", i + 1);
+
+				exit(EXIT_SUCCESS);
+			} else {
+				/* Parent: wait for the child and report how it ended. */
+				wpid = waitpid(cpid, &wstatus, 0);
+				if (wpid == -1) {
+					printf("waitpid error.\n");
+					exit(EXIT_FAILURE);
+				}
+
+				if (WIFEXITED(wstatus))
+					printf("Case process exited. status %d\n",
+						WEXITSTATUS(wstatus));
+				else if (WIFSIGNALED(wstatus))
+					printf("Case process killed by signal %d\n",
+						WTERMSIG(wstatus));
+				else if (WIFSTOPPED(wstatus))
+					printf("Case process stopped by signal %d\n",
+						WSTOPSIG(wstatus));
+				else if (WIFCONTINUED(wstatus))
+					printf("Case process continued.\n");
+				else
+					printf("Case process unknown terminated.\n");
+			}
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
+#endif
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..78fbb64107
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a variable config entry advances between scenarios. */
+typedef enum {
+	OP_NONE = 0,	/* fixed value, no sweep */
+	OP_ADD,		/* cur += incr each scenario */
+	OP_MUL		/* cur *= incr each scenario */
+} alg_op_type;
+
+/* One "first[,last,incr[,ADD|MUL]]" config value and its sweep state. */
+struct test_configure_entry {
+	uint32_t first;	/* initial value */
+	uint32_t last;	/* final value of a sweep (0 if fixed) */
+	uint32_t incr;	/* step added/multiplied per scenario (0 if fixed) */
+	alg_op_type op;	/* how incr is applied */
+	uint32_t cur;	/* value used by the current scenario */
+};
+
+/* Full configuration of one [caseN] section from config.ini. */
+struct test_configure {
+	uint8_t test_type;		/* TEST_TYPE_* enum value */
+	const char *test_type_str;	/* "DMA_MEM_COPY" / "CPU_MEM_COPY" */
+	uint16_t src_numa_node;		/* numa node for source buffers */
+	uint16_t dst_numa_node;		/* numa node for destination buffers */
+	uint16_t opcode;
+	bool is_dma;
+	struct test_configure_entry mem_size;	/* total footprint, MB */
+	struct test_configure_entry buf_size;	/* bytes per operation */
+	struct test_configure_entry ring_size;	/* dmadev descriptor ring size */
+	struct test_configure_entry kick_batch;	/* enqueues per doorbell kick */
+	uint32_t cache_flush;		/* non-zero: flush buffers before test */
+	uint32_t nr_buf;		/* derived: buffer count per scenario */
+	uint32_t repeat_times;		/* passes over the buffer set */
+	uint32_t nb_workers;		/* worker lcore count */
+	uint16_t mpool_iter_step;	/* stride used when visiting buffers */
+	const char *eal_args;		/* extra EAL arguments for this case */
+	uint8_t scenario_id;		/* current scenario within the sweep */
+};
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..17ff384378
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2022 Intel Corporation
+
+# meson file, for building this example as part of a main DPDK build.
+#
+# To build this example as a standalone application with an already-installed
+# DPDK instance, use 'make'
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+allow_experimental_apis = true
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.35.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v3] app/dma-perf: introduce dma-perf application
  2022-12-20  1:06 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
  2023-01-17  1:56 ` [PATCH v2] " Cheng Jiang
@ 2023-01-17 12:05 ` Cheng Jiang
  2023-01-17 15:44   ` Bruce Richardson
  2023-01-17 16:51   ` Bruce Richardson
  1 sibling, 2 replies; 15+ messages in thread
From: Cheng Jiang @ 2023-01-17 12:05 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
fixed compile issues for loongarch.
fixed compile issues for intel.
fixed coding style issues.

v2:
fixed some CI issues.

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 541 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/benchmark.h |  12 +
 app/test-dma-perf/config.ini  |  61 ++++
 app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  57 ++++
 app/test-dma-perf/meson.build |  20 ++
 7 files changed, 1126 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/benchmark.h
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..a060ad2725 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -28,6 +28,7 @@ apps = [
         'test-regex',
         'test-sad',
         'test-security-perf',
+        'test-dma-perf',
 ]

 default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..7ec3f95643
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+
+#define MAX_DMA_CPL_NB 255
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+
+/* Per-worker parameters handed to each benchmark lcore. */
+struct lcore_params {
+	uint16_t dev_id;	/* dmadev used by this worker (DMA test only) */
+	uint32_t nr_buf;	/* number of src/dst buffer pairs this worker copies */
+	uint16_t kick_batch;	/* enqueues between DMA doorbell kicks */
+	uint32_t buf_size;	/* bytes copied per buffer */
+	uint32_t repeat_times;	/* full passes over the buffer set */
+	uint16_t mpool_iter_step;	/* stride used when walking the buffer arrays */
+	struct rte_mbuf **srcs;	/* source buffers (slice of the global array) */
+	struct rte_mbuf **dsts;	/* destination buffers (same slicing as srcs) */
+	uint8_t scenario_id;	/* scenario number, used for CSV output */
+};
+
+/* Buffer-array descriptor passed to cache_flush_buf(). */
+struct buf_info {
+	struct rte_mbuf **array;	/* buffers whose payload is flushed */
+	uint32_t nr_buf;	/* number of buffers in array */
+	uint32_t buf_size;	/* payload bytes per buffer */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+uint16_t dmadev_ids[MAX_WORKER_NB];
+uint32_t nb_dmadevs;
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * fprintf-style error reporting to stderr, prefixed with the calling
+ * function and line (used via the PRINT_ERR macro). Returns the total
+ * number of characters written.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	int nb_written;
+	va_list ap;
+
+	nb_written = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(ap, format);
+	nb_written += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return nb_written;
+}
+
+/*
+ * Derive the report metrics from the measured cycle count.
+ * memory: total src+dst footprint in MB; ave_cycle: cycles per single copy;
+ * bandwidth: Gbps per copy based on the timer frequency; ops: copies/second.
+ * Uses the repeat_times argument consistently (the original mixed it with
+ * p->repeat_times), widens the footprint math to 64 bits to avoid uint32
+ * overflow for large mem_size, and guards against division by zero.
+ */
+static inline void
+calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
+			uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
+			float *bandwidth, uint64_t *ops)
+{
+	uint64_t nr_copies = (uint64_t)repeat_times * p->nr_buf;
+
+	*memory = (uint32_t)(((uint64_t)p->buf_size * p->nr_buf * 2) / (1024 * 1024));
+	*ave_cycle = nr_copies == 0 ? 0 : cp_cycle_sum / nr_copies;
+	*bandwidth = *ave_cycle == 0 ? 0 :
+		p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
+	*ops = time_sec == 0 ? 0 : (double)p->nr_buf * repeat_times / time_sec;
+}
+
+/*
+ * Print one scenario's results to stdout and stage the CSV line for this
+ * lcore in output_str[]; the staged line is written out later by the
+ * main process via output_csv(). dev_id is only meaningful when is_dma.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, uint64_t ops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %u:\n"
+				"average cycles: %" PRIu64 ","
+				" buffer size: %u, nr_buf: %u,"
+				" memory: %uMB, frequency: %" PRIu64 ".\n",
+				lcore_id,
+				dev_id,
+				ave_cycle,
+				buf_size,
+				nr_buf,
+				memory,
+				rte_get_timer_hz());
+	else
+		printf("lcore %u\n"
+			"average cycles: %" PRIu64 ","
+			" buffer size: %u, nr_buf: %u,"
+			" memory: %uMB, frequency: %" PRIu64 ".\n",
+			lcore_id,
+			ave_cycle,
+			buf_size,
+			nr_buf,
+			memory,
+			rte_get_timer_hz());
+
+	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
+
+	/* One staged CSV line per lcore; the DMA format carries the dev id. */
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dev_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+}
+
+/*
+ * Flush the payload of every mbuf in the array out of the CPU caches so
+ * the benchmark measures cold-cache copies. Implemented with clflush on
+ * x86_64 only (assumes 64-byte cache lines); a no-op on other
+ * architectures, hence the __maybe_unused on arg.
+ */
+static inline void
+cache_flush_buf(void *arg __maybe_unused)
+{
+#ifdef RTE_ARCH_X86_64
+	char *data;
+	char *addr;
+	struct buf_info *info = arg;	/* callers always pass a struct buf_info */
+	struct rte_mbuf **srcs = info->array;
+	uint32_t i, k;
+
+	for (i = 0; i < info->nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (k = 0; k < info->buf_size / 64; k++) {
+			addr = (k * 64 + data);
+			__builtin_ia32_clflush(addr);
+		}
+	}
+#endif
+}
+
+/*
+ * Configure a dmadev with a single MEM_TO_MEM vchan of ring_size
+ * descriptors and start it. Any failure is fatal: rte_exit() is used
+ * consistently (the original mixed printf()+rte_panic() with rte_exit()),
+ * and the previously ignored rte_dma_info_get() return is now checked.
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration\n");
+
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with getting device info\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE,
+			"Error, no configured queues reported on device id %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+
+/*
+ * Claim and configure one dmadev per worker, recording the device ids in
+ * the global dmadev_ids[] and updating nb_dmadevs. Returns 0 on success,
+ * -1 when fewer dmadevs than workers are available (devices configured so
+ * far remain started; the caller's cleanup path stops them).
+ */
+static int
+config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
+{
+	int16_t dev_id = rte_dma_next_dev(0);
+	uint32_t i;
+
+	nb_dmadevs = 0;
+
+	for (i = 0; i < nb_workers; i++) {
+		/* -1 means no more dmadevs are available */
+		if (dev_id == -1)
+			goto end;
+
+		dmadev_ids[i] = dev_id;
+		configure_dmadev_queue(dmadev_ids[i], ring_size);
+		dev_id = rte_dma_next_dev(dev_id + 1);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+/*
+ * Enqueue nr_buf copies on vchan 0 of dev_id, kicking the doorbell every
+ * kick_batch enqueues and polling completions to keep the ring from
+ * filling. Buffers are visited with stride mpool_iter_step to control
+ * buffer continuity. Blocks until every submitted copy has completed.
+ */
+static inline void
+do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
+			uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
+{
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t index;
+	uint16_t offset;
+	uint32_t i;
+
+	for (offset = 0; offset < mpool_iter_step; offset++) {
+		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						srcs[index]->buf_iova + srcs[index]->data_off,
+						dsts[index]->buf_iova + dsts[index]->data_off,
+						buf_size,
+						0) < 0)) {
+				/* Ring full: flush pending work, drain completions
+				 * until space frees up, then retry once.
+				 */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+				}
+				if (rte_dma_copy(dev_id,
+						0,
+						srcs[index]->buf_iova + srcs[index]->data_off,
+						dsts[index]->buf_iova + dsts[index]->data_off,
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", index);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			/*
+			 * Kick the doorbell every kick_batch enqueues. '%' is
+			 * used here (not '&'), so kick_batch does NOT need to
+			 * be a power of two; the previous comment was stale.
+			 */
+			if (unlikely((async_cnt % kick_batch) == 0)) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+			}
+		}
+
+		/* Submit the tail of the batch and wait for all in-flight copies. */
+		rte_dma_submit(dev_id, 0);
+		while (async_cnt > 0) {
+			nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+			async_cnt -= nr_cpl;
+		}
+	}
+}
+
+/*
+ * Worker entry point for the DMA copy test (launched via
+ * rte_eal_remote_launch). Runs do_dma_mem_copy() repeat_times times,
+ * measures the elapsed TSC cycles, and reports/stages the results.
+ * Takes ownership of the rte_malloc'd lcore_params and frees it.
+ * Always returns 0.
+ */
+static int
+dma_mem_copy(void *p)
+{
+	uint64_t ops;
+	uint32_t memory;
+	float bandwidth;
+	double time_sec;
+	uint32_t lcore_id = rte_lcore_id();
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint16_t kick_batch = params->kick_batch;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t dev_id = params->dev_id;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t r;
+
+	begin = rte_rdtsc();
+
+	for (r = 0; r < repeat_times; r++)
+		do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
+			mpool_iter_step, srcs, dsts);
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+			&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
+			memory, bandwidth, ops, true);
+
+	/* params was rte_malloc'd by the launcher; this worker frees it. */
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Worker entry point for the CPU copy baseline. Copies every src buffer
+ * to its dst with rte_memcpy() repeat_times times, using the same
+ * mpool_iter_step striding as the DMA path, then reports/stages results.
+ * Takes ownership of the rte_malloc'd lcore_params and frees it.
+ * Always returns 0.
+ */
+static int
+cpu_mem_copy(void *p)
+{
+	uint32_t idx;
+	uint32_t lcore_id;
+	uint32_t memory;
+	uint64_t ops;
+	float bandwidth;
+	double time_sec;
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t k, j, offset;
+
+	begin = rte_rdtsc();
+
+	for (k = 0; k < repeat_times; k++) {
+		/* copy buffer form src to dst */
+		for (offset = 0; offset < mpool_iter_step; offset++) {
+			for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
+				rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
+					(void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
+					(size_t)buf_size);
+			}
+		}
+	}
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	lcore_id = rte_lcore_id();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+			&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
+			memory, bandwidth, ops, false);
+
+	/* params was rte_malloc'd by the launcher; this worker frees it. */
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Create the source/destination mempools and allocate cfg->nr_buf mbufs
+ * from each into caller-provided arrays. Returns 0 on success, -1 on any
+ * failure; the caller releases whatever was allocated. The pointer arrays
+ * are calloc'd (not malloc'd) so that on a partial mbuf-allocation
+ * failure the caller's element-wise rte_pktmbuf_free() loop sees NULL
+ * (a safe no-op) instead of uninitialized pointers.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	uint32_t i;
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/* calloc zero-fills and needs no cast in C; sizeof on the element. */
+	*srcs = calloc(nr_buf, sizeof((*srcs)[0]));
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = calloc(nr_buf, sizeof((*dsts)[0]));
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_buf; i++) {
+		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
+		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);
+		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
+			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
+			return -1;
+		}
+
+		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
+		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one DMA copy scenario: set up buffers and dmadevs, optionally flush
+ * caches, launch one dma_mem_copy() worker per configured lcore, wait for
+ * completion, then tear everything down (buffers, pools, dmadevs).
+ * nr_buf is derived so src + dst buffers together fill mem_size MB.
+ */
+void
+dma_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i;
+	uint32_t offset;
+	unsigned int lcore_id  = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint32_t repeat_times = cfg->repeat_times;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
+		goto out;
+
+	if (cfg->cache_flush) {
+		struct buf_info info;
+
+		info.array = srcs;
+		info.buf_size = buf_size;
+		info.nr_buf = nr_buf;
+		cache_flush_buf(&info);
+
+		/* same geometry for the destination side, only the array differs */
+		info.array = dsts;
+		cache_flush_buf(&info);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	/* Launch one worker per lcore, each owning an nr_buf/nb_workers slice. */
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = rte_get_next_lcore(lcore_id, true, true);
+		offset = nr_buf / nb_workers * i;
+
+		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* p is freed by the worker (dma_mem_copy) when it finishes. */
+		*p = (struct lcore_params) {
+			dmadev_ids[i],
+			(uint32_t)(nr_buf/nb_workers),
+			kick_batch,
+			buf_size,
+			repeat_times,
+			mpool_iter_step,
+			srcs + offset,
+			dsts + offset,
+			cfg->scenario_id
+		};
+
+		rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+
+	/* Stop only the devices config_dmadevs() actually configured. */
+	for (i = 0; i < nb_dmadevs; i++) {
+		printf("Stopping dmadev %d\n", dmadev_ids[i]);
+		rte_dma_stop(dmadev_ids[i]);
+	}
+}
+
+/*
+ * Run one CPU copy scenario: set up buffers, launch one cpu_mem_copy()
+ * worker per configured lcore, wait, and free all buffers and pools.
+ * No dmadevs are involved (dev_id 0 in lcore_params is unused here).
+ */
+void
+cpu_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i, offset;
+	uint32_t repeat_times = cfg->repeat_times;
+	uint32_t kick_batch = cfg->kick_batch.cur;
+	uint32_t buf_size = cfg->buf_size.cur;
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	struct rte_mbuf **srcs  = NULL, **dsts  = NULL;
+	unsigned int lcore_id = 0;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	for (i = 0; i < nb_workers; i++) {
+		/* NOTE(review): skip-main/wrap flags differ from the DMA path's
+		 * (true, true) call — confirm the difference is intentional.
+		 */
+		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
+		offset = nr_buf / nb_workers * i;
+		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* p is freed by the worker (cpu_mem_copy) when it finishes. */
+		*p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
+						buf_size, repeat_times, mpool_iter_step,
+						srcs + offset, dsts + offset, cfg->scenario_id };
+		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+}
diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
new file mode 100644
index 0000000000..f5ad8d6d99
--- /dev/null
+++ b/app/test-dma-perf/benchmark.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _BENCHMARK_H_
+#define _BENCHMARK_H_
+
+void dma_mem_copy_benchmark(struct test_configure *cfg);
+
+void cpu_mem_copy_benchmark(struct test_configure *cfg);
+
+#endif /* _BENCHMARK_H_ */
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..e24bb19414
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; Supported test types:
+; DMA_MEM_COPY|CPU_MEM_COPY
+
+; Parameters:
+; "mem_size","buf_size","dma_ring_size","kick_batch".
+; "mem_size" means the size of the memory footprint.
+; "buf_size" means the memory size of a single operation.
+; "dma_ring_size" means the dma ring buffer size.
+; "kick_batch" means dma operation batch size.
+
+; Format: variable=first[,last,increment[,ADD|MUL]]
+; ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to control if the cache should be flushed.
+
+; repeat_times is used to control the repeat times of the whole case.
+
+; worker_threads is used to control the number of worker threads used by the test app.
+; It should be less than the number of available cores.
+
+; mpool_iter_step is used to control the buffer continuity.
+
+; Bind DMA to lcore:
+; Specify the "lcore_dma" parameter.
+; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
+; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
+; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
+; their range.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+repeat_times=10
+worker_threads=1
+mpool_iter_step=1
+lcore_dma=lcore3@0000:00:04.0
+eal_args=--legacy-mem --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+repeat_times=100
+worker_threads=1
+mpool_iter_step=1
+eal_args=--no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..8041f5fdaf
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,434 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdio.h>
+#if !defined(RTE_EXEC_ENV_LINUX)
+
+int
+main(int argc, char *argv[])
+{
+	printf("OS not supported, skipping test\n");
+	return 0;
+}
+
+#else
+
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),OPS\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Flush every staged per-lcore result line from output_str[] to the CSV
+ * file, clearing each staged entry as it is written. When need_blankline
+ * is set, two all-comma separator rows are emitted first.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t idx;
+
+	if (need_blankline) {
+		fprintf(fd, "%s", ",,,,,,,,\n");
+		fprintf(fd, "%s", ",,,,,,,,\n");
+	}
+
+	for (idx = 0; idx < RTE_DIM(output_str); idx++) {
+		if (!output_str[idx][0])
+			continue;
+		fprintf(fd, "%s", output_str[idx]);
+		memset(output_str[idx], 0, MAX_OUTPUT_STR_LEN);
+	}
+
+	fflush(fd);
+}
+
+/* Stage the environment banner (timer frequency) and flush it to the CSV. */
+static void
+output_env_info(void)
+{
+	uint64_t timer_hz = rte_get_timer_hz();
+
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64 "\n", timer_hz);
+
+	output_csv(true);
+}
+
+/* Stage the per-case CSV column header and flush it, preceded by a gap. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, CSV_HDR_FMT,
+			case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/* Dispatch one scenario run to the benchmark matching its test type. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY)
+		dma_mem_copy_benchmark(case_cfg);
+	else if (type == TEST_TYPE_CPU_MEM_COPY)
+		cpu_mem_copy_benchmark(case_cfg);
+	else
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run all scenarios of one test case. At most one of the four range
+ * entries is expected to vary (incr != 0); that entry is swept from
+ * first to last, running one scenario per value. If several entries
+ * vary, only the last one checked here (kick_batch) is swept.
+ * Results are flushed to the CSV after every scenario.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry *var_entry = NULL;
+
+	/* Clear any result lines left over from a previous case. */
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* One lcore is the main lcore, so strictly more than nb_workers needed. */
+	if (nb_lcores <= case_cfg->nb_workers) {
+		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
+			case_id, nb_lcores, case_cfg->nb_workers);
+		return;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	if (var_entry) {
+		for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+			case_cfg->scenario_id++;
+			printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+			run_test_case(case_cfg);
+			output_csv(false);
+
+			if (var_entry->op == OP_MUL)
+				var_entry->cur *= var_entry->incr;
+			else
+				var_entry->cur += var_entry->incr;
+
+
+		}
+	} else {
+		/* Nothing varies: a single scenario with the fixed values. */
+		run_test_case(case_cfg);
+		output_csv(false);
+	}
+}
+
+/*
+ * Parse a "first[,last,incr[,ADD|MUL]]" config value into entry.
+ * Returns the number of comma-separated fields found, or -1 for an
+ * empty value. cur is initialized to first; missing fields become 0.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char copy[255] = {0};
+	char *fields[MAX_PARAMS_PER_ENTRY];
+	int nb_fields = -1;
+
+	strncpy(copy, value, 254);
+	if (copy[0] == '\0')
+		goto out;
+
+	nb_fields = rte_strsplit(copy, strlen(copy), fields, MAX_PARAMS_PER_ENTRY, ',');
+	if (nb_fields <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(fields[0]);
+	entry->last = nb_fields > 1 ? (uint32_t)atoi(fields[1]) : 0;
+	entry->incr = nb_fields > 2 ? (uint32_t)atoi(fields[2]) : 0;
+
+	if (nb_fields <= 3)
+		entry->op = OP_NONE;
+	else if (strcmp(fields[3], "MUL") == 0)
+		entry->op = OP_MUL;
+	else
+		entry->op = OP_ADD;
+out:
+	return nb_fields;
+}
+
+static void
+load_configs(void)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char **sections_name;
+	const char *section_name, *case_type;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+
+	sections_name = malloc(MAX_TEST_CASES * sizeof(char *));
+	for (i = 0; i < MAX_TEST_CASES; i++)
+		sections_name[i] = malloc(CFG_NAME_LEN * sizeof(char *));
+
+	cfgfile = rte_cfgfile_load("./config.ini", 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+	rte_cfgfile_sections(cfgfile, sections_name, MAX_TEST_CASES);
+	for (i = 0; i < nb_sections; i++) {
+		test_case = &test_cases[i];
+		section_name = sections_name[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d\n.", i + 1);
+			exit(1);
+		}
+		if (!strcmp(case_type, DMA_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+		} else if (!strcmp(case_type, CPU_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+		} else {
+			printf("Error: Cannot find case type %s.\n", case_type);
+			exit(1);
+		}
+
+		nb_vp = 0;
+
+		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "src_numa_node"));
+		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dst_numa_node"));
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "dma_ring_size");
+		args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+		args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		if (nb_vp > 2) {
+			printf("%s, variable parameters can only have one.\n", section_name);
+			break;
+		}
+
+		test_case->cache_flush =
+			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name, "cache_flush"));
+		test_case->repeat_times =
+			(uint32_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "repeat_times"));
+		test_case->nb_workers =
+			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "worker_threads"));
+		test_case->mpool_iter_step =
+			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "mpool_iter_step"));
+
+		test_case->eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	}
+
+	rte_cfgfile_close(cfgfile);
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (sections_name[i] != NULL)
+			free(sections_name[i]);
+	}
+	free(sections_name);
+}
+
+/*
+ * Build a new argv by copying the command-line arguments and appending
+ * the per-case EAL arguments split on spaces. Returns the new argument
+ * count. Each new_argv[] slot is a MAX_EAL_PARAM_LEN buffer; all copies
+ * are bounded with snprintf() (the original used unbounded strcpy()),
+ * and the total count is capped at MAX_EAL_PARAM_NB.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int new_argc, token_nb;
+
+	new_argc = argc;
+
+	for (i = 0; i < argc && i < MAX_EAL_PARAM_NB; i++)
+		snprintf(new_argv[i], MAX_EAL_PARAM_LEN, "%s", argv[i]);
+
+	if (eal_args) {
+		snprintf(args, sizeof(args), "%s", eal_args);
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb && new_argc < MAX_EAL_PARAM_NB; i++)
+			snprintf(new_argv[new_argc++], MAX_EAL_PARAM_LEN, "%s", tokens[i]);
+	}
+
+	return new_argc;
+}
+
+/*
+ * Entry point: load config.ini, then fork one child per configured test
+ * case. Each child initializes EAL (optionally with the case's eal_args),
+ * runs the case, appends its results to ./test_result.csv and exits; the
+ * parent reaps the child and reports how it terminated. Forking gives
+ * every case a fresh EAL instance.
+ */
+int
+main(int argc __maybe_unused, char *argv[] __maybe_unused)
+{
+	int ret;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	/* Sized with MAX_EAL_PARAM_NB (was a magic 100 out of sync with
+	 * args[] and append_eal_args()'s bound).
+	 */
+	char *pargs[MAX_EAL_PARAM_NB];
+	int new_argc;
+
+	memset(args, 0, sizeof(args));
+	for (i = 0; i < MAX_EAL_PARAM_NB; i++)
+		pargs[i] = args[i];
+
+	load_configs();
+	/* Create/truncate the result file; children re-open it in append mode. */
+	fd = fopen("./test_result.csv", "w");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return 0;
+	}
+	fclose(fd);
+
+	/* loop each case, run it */
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE)
+			continue;
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %u failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n", i + 1);
+
+			if (test_cases[i].eal_args) {
+				new_argc = append_eal_args(argc, argv,
+					test_cases[i].eal_args, pargs);
+
+				ret = rte_eal_init(new_argc, pargs);
+			} else {
+				ret = rte_eal_init(argc, argv);
+			}
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen("./test_result.csv", "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+
+			if (i == 0)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n");
+			else
+				printf("Case process unknown terminated.\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
+#endif
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..a8fcf4f34d
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* Operation used to advance a variable parameter between scenarios. */
+typedef enum {
+	OP_NONE = 0,	/* fixed value, no iteration */
+	OP_ADD,		/* cur += incr after each scenario */
+	OP_MUL		/* cur *= incr after each scenario */
+} alg_op_type;
+
+/* One "first[,last,incr[,ADD|MUL]]" range parsed from config.ini. */
+struct test_configure_entry {
+	uint32_t first;	/* initial value */
+	uint32_t last;	/* final value (0 when the entry is fixed) */
+	uint32_t incr;	/* step; 0 means the entry does not vary */
+	alg_op_type op;	/* how incr is applied */
+	uint32_t cur;	/* value used by the currently running scenario */
+};
+
+/* Full configuration of one [caseN] section from config.ini. */
+struct test_configure {
+	uint8_t test_type;		/* TEST_TYPE_* identifier */
+	const char *test_type_str;	/* type string as written in the file */
+	uint16_t src_numa_node;		/* numa node for source buffers */
+	uint16_t dst_numa_node;		/* numa node for destination buffers */
+	uint16_t opcode;		/* NOTE(review): not referenced in this patch */
+	bool is_dma;			/* NOTE(review): not referenced in this patch */
+	struct test_configure_entry mem_size;	/* memory footprint, MB */
+	struct test_configure_entry buf_size;	/* per-copy buffer size, bytes */
+	struct test_configure_entry ring_size;	/* dmadev descriptor ring size */
+	struct test_configure_entry kick_batch;	/* enqueues per doorbell kick */
+	uint32_t cache_flush;		/* flush caches before the test when non-zero */
+	uint32_t nr_buf;		/* derived: number of buffers per side */
+	uint32_t repeat_times;		/* iterations of the copy loop */
+	uint32_t nb_workers;		/* worker lcores to launch */
+	uint16_t mpool_iter_step;	/* stride when walking the mbuf arrays */
+	const char *eal_args;		/* extra EAL arguments for this case */
+	uint8_t scenario_id;		/* 1-based scenario counter within the case */
+};
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..001f67f6c1
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2022 Intel Corporation
+
+# meson file, for building this example as part of a main DPDK build.
+#
+# To build this example as a standalone application with an already-installed
+# DPDK instance, use 'make'
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
--
2.35.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2] app/dma-perf: introduce dma-perf application
  2023-01-17  1:56 ` [PATCH v2] " Cheng Jiang
@ 2023-01-17 13:00   ` Bruce Richardson
  2023-01-17 13:54     ` Jiang, Cheng1
  0 siblings, 1 reply; 15+ messages in thread
From: Bruce Richardson @ 2023-01-17 13:00 UTC (permalink / raw)
  To: Cheng Jiang
  Cc: thomas, mb, dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang,
	xingguang.he

On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v2: fixed some CI issues.

Some first review comments inline below. More will likely follow as I
review it further and try testing it out.

/Bruce

> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 539 ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/benchmark.h |  12 +
>  app/test-dma-perf/config.ini  |  61 ++++
>  app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  53 ++++
>  app/test-dma-perf/meson.build |  22 ++
>  7 files changed, 1122 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c
>  create mode 100644 app/test-dma-perf/benchmark.h
>  create mode 100644 app/test-dma-perf/config.ini
>  create mode 100644 app/test-dma-perf/main.c
>  create mode 100644 app/test-dma-perf/main.h
>  create mode 100644 app/test-dma-perf/meson.build
> 
> diff --git a/app/meson.build b/app/meson.build
> index e32ea4bd5c..a060ad2725 100644
> --- a/app/meson.build
> +++ b/app/meson.build
> @@ -28,6 +28,7 @@ apps = [
>          'test-regex',
>          'test-sad',
>          'test-security-perf',
> +        'test-dma-perf',
>  ]

Lists in DPDK are always alphabetical when no other order is required,
therefore this new app should be further up the list, after
"test-crypto-perf".

> 
>  default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> new file mode 100644
> index 0000000000..1cb5b0b291
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.c
> @@ -0,0 +1,539 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <inttypes.h>
> +
> +#include <rte_time.h>
> +#include <rte_mbuf.h>
> +#include <rte_dmadev.h>
> +#include <rte_malloc.h>
> +#include <rte_lcore.h>
> +
> +#include "main.h"
> +#include "benchmark.h"
> +
> +
> +#define MAX_DMA_CPL_NB 255
> +
> +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> +
> +struct lcore_params {
> +	uint16_t dev_id;
> +	uint32_t nr_buf;
> +	uint16_t kick_batch;
> +	uint32_t buf_size;
> +	uint32_t repeat_times;
> +	uint16_t mpool_iter_step;
> +	struct rte_mbuf **srcs;
> +	struct rte_mbuf **dsts;
> +	uint8_t scenario_id;
> +};
> +
> +struct buf_info {
> +	struct rte_mbuf **array;
> +	uint32_t nr_buf;
> +	uint32_t buf_size;
> +};
> +
> +static struct rte_mempool *src_pool;
> +static struct rte_mempool *dst_pool;
> +
> +uint16_t dmadev_ids[MAX_WORKER_NB];
> +uint32_t nb_dmadevs;
> +
> +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> +
> +static inline int
> +__rte_format_printf(3, 4)
> +print_err(const char *func, int lineno, const char *format, ...)
> +{
> +	va_list ap;
> +	int ret;
> +
> +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> +	va_start(ap, format);
> +	ret += vfprintf(stderr, format, ap);
> +	va_end(ap);
> +
> +	return ret;
> +}
> +
> +static inline void
> +calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
> +			uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
> +			float *bandwidth, uint64_t *ops)
> +{
> +	*memory = (p->buf_size * p->nr_buf * 2) / (1024 * 1024);
> +	*ave_cycle = cp_cycle_sum / (p->repeat_times * p->nr_buf);
> +	*bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
> +	*ops = (double)p->nr_buf * repeat_times / time_sec;
> +}
> +
> +static void
> +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
> +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> +			float bandwidth, uint64_t ops, bool is_dma)
> +{
> +	if (is_dma)
> +		printf("lcore %u, DMA %u:\n"
> +				"average cycles: %" PRIu64 ","
> +				" buffer size: %u, nr_buf: %u,"
> +				" memory: %uMB, frequency: %" PRIu64 ".\n",
> +				lcore_id,
> +				dev_id,
> +				ave_cycle,
> +				buf_size,
> +				nr_buf,
> +				memory,
> +				rte_get_timer_hz());

Longer lines are allowed for strings, so you can merge each line of output
to a single line, which will improve readability.
Also, to shorten the code, there is no reason each parameter needs to go on
its own line.

> +	else
> +		printf("lcore %u\n"
> +			"average cycles: %" PRIu64 ","
> +			" buffer size: %u, nr_buf: %u,"
> +			" memory: %uMB, frequency: %" PRIu64 ".\n",
> +			lcore_id,
> +			ave_cycle,
> +			buf_size,
> +			nr_buf,
> +			memory,
> +			rte_get_timer_hz());

Suggestion, rather than duplicating the whole output, only the first line
needs to change based on SW vs HW copies. How about:

	if (is_dma)
		printf("lcore %u, DMA %u\n", lcore_id, dev_id);
	else
		printf("lcore %u\n", lcore_id);
	printf("average cycles: ..." , ...);

> +
> +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
> +
> +	if (is_dma)
> +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> +			CSV_LINE_DMA_FMT,
> +			scenario_id, lcore_id, dev_id, buf_size,
> +			nr_buf, memory, ave_cycle, bandwidth, ops);
> +	else
> +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> +			CSV_LINE_CPU_FMT,
> +			scenario_id, lcore_id, buf_size,
> +			nr_buf, memory, ave_cycle, bandwidth, ops);
> +}
> +
> +static inline void
> +cache_flush_buf(void *arg)

For non-x86 builds, you probably need to mark "arg" as unused to avoid
compiler warnings.

Why is the parameter type given as a void pointer, when the type is
unconditionally cast below as "struct buf_info"? Void pointer type should
only be needed if you need to call this via a generic function pointer.

> +{
> +#ifdef RTE_ARCH_X86_64
> +	char *data;
> +	char *addr;
> +	struct buf_info *info = arg;
> +	struct rte_mbuf **srcs = info->array;
> +	uint32_t i, k;
> +
> +	for (i = 0; i < info->nr_buf; i++) {
> +		data = rte_pktmbuf_mtod(srcs[i], char *);
> +		for (k = 0; k < info->buf_size / 64; k++) {
> +			addr = (k * 64 + data);
> +			__builtin_ia32_clflush(addr);
> +		}

inner loop may be shorter by incrementing loop var by 64, rather than dividing
and then multiplying, since you can eliminate variable "addr".
Also can be more readable with a variable rename:

	for (offset = 0; offset < info->buf_size; offset += 64) 
		__builtin_ia32_clflush(data + offset);

> +	}
> +#endif
> +}
> +
> +/* Configuration of device. */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
> +{
> +	uint16_t vchan = 0;
> +	struct rte_dma_info info;
> +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> +	struct rte_dma_vchan_conf qconf = {
> +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> +		.nb_desc = ring_size
> +	};
> +
> +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
> +
> +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0) {
> +		printf("Error with queue configuration\n");
> +		rte_panic();
> +	}
> +

Inconsistency here - and below too. Either use rte_exit on failure or use
rte_panic, but don't mix them. Panic seems a little severe, so I suggest
just using rte_exit() in all cases.

> +	rte_dma_info_get(dev_id, &info);
> +	if (info.nb_vchans != 1) {
> +		printf("Error, no configured queues reported on device id %u\n", dev_id);
> +		rte_panic();
> +	}
> +	if (rte_dma_start(dev_id) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
> +}
> +
> +static int
> +config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
> +{
> +	int16_t dev_id = rte_dma_next_dev(0);
> +	uint32_t i;
> +
> +	nb_dmadevs = 0;
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		if (dev_id == -1)
> +			goto end;
> +
> +		dmadev_ids[i] = dev_id;
> +		configure_dmadev_queue(dmadev_ids[i], ring_size);
> +		dev_id = rte_dma_next_dev(dev_id + 1);
> +		++nb_dmadevs;

Very minor nit, but I'd suggest swapping these last two lines, incrementing
nb_dmadevs right after configuring the device, but before finding a new
one. It just makes more sense to me.

> +	}
> +
> +end:
> +	if (nb_dmadevs < nb_workers) {
> +		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
> +		return -1;
> +	}
> +
> +	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
> +
> +	return 0;
> +}
> +
> +static inline void
> +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
> +			uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
> +{
> +	int64_t async_cnt = 0;
> +	int nr_cpl = 0;
> +	uint32_t index;
> +	uint16_t offset;
> +	uint32_t i;
> +
> +	for (offset = 0; offset < mpool_iter_step; offset++) {
> +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {

Assignment in the condition part of a loop seems wrong. I suggest reworking
this to avoid it.

> +			if (unlikely(rte_dma_copy(dev_id,
> +						0,
> +						srcs[index]->buf_iova + srcs[index]->data_off,
> +						dsts[index]->buf_iova + dsts[index]->data_off,

rte_pktmbuf_iova() macro can be used here.

> +						buf_size,
> +						0) < 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
> +					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
> +								NULL, NULL);
> +					async_cnt -= nr_cpl;
> +				}
> +				if (rte_dma_copy(dev_id,
> +						0,
> +						srcs[index]->buf_iova + srcs[index]->data_off,
> +						dsts[index]->buf_iova + dsts[index]->data_off,
> +						buf_size,
> +						0) < 0) {
> +					printf("enqueue fail again at %u\n", index);
> +					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
> +					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
> +				}
> +			}
> +			async_cnt++;
> +
> +			/**
> +			 * When '&' is used to wrap an index, mask must be a power of 2.
> +			 * That is, kick_batch must be 2^n.

I assume that is checked on input processing when parsing the config file?

> +			 */
> +			if (unlikely((async_cnt % kick_batch) == 0)) {

This is an expected condition that will occur with repeatable frequency.
Therefore, unlikely is not really appropriate.

> +				rte_dma_submit(dev_id, 0);
> +				/* add a poll to avoid ring full */
> +				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +				async_cnt -= nr_cpl;
> +			}
> +		}
> +
> +		rte_dma_submit(dev_id, 0);
> +		while (async_cnt > 0) {
> +			nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +			async_cnt -= nr_cpl;
> +		}

Do we need a timeout here or in the loop above in case of errors that cause
us to not get all the elements back?

> +	}
> +}
> +
> +static int
> +dma_mem_copy(void *p)
> +{

I see the call to this function within "remote_launch" uses a cast on the
function. I don't think that typecast should be necessary, but if you keep
it, you can avoid using the void pointer here and just mark the input type
as "struct lcore_params" directly.

> +	uint64_t ops;
> +	uint32_t memory;
> +	float bandwidth;
> +	double time_sec;
> +	uint32_t lcore_id = rte_lcore_id();
> +	struct lcore_params *params = (struct lcore_params *)p;
> +	uint32_t repeat_times = params->repeat_times;
> +	uint32_t buf_size = params->buf_size;
> +	uint16_t kick_batch = params->kick_batch;
> +	uint32_t lcore_nr_buf = params->nr_buf;
> +	uint16_t dev_id = params->dev_id;
> +	uint16_t mpool_iter_step = params->mpool_iter_step;
> +	struct rte_mbuf **srcs = params->srcs;
> +	struct rte_mbuf **dsts = params->dsts;
> +	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> +	uint32_t r;
> +
> +	begin = rte_rdtsc();
> +
> +	for (r = 0; r < repeat_times; r++)
> +		do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
> +			mpool_iter_step, srcs, dsts);
> +
> +	end = rte_rdtsc();
> +	total_cycles = end - begin;

You can do without "end" easily enough:
	total_cycles = rte_rdtsc() - begin;

> +	time_sec = (double)total_cycles / rte_get_timer_hz();
> +
> +	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> +			&avg_cycles, &bandwidth, &ops);
> +	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
> +			memory, bandwidth, ops, true);
> +
> +	rte_free(p);
> +
> +	return 0;
> +}
> +
> +static int
> +cpu_mem_copy(void *p)
> +{

Most of comments from above, also apply here.

> +	uint32_t idx;
> +	uint32_t lcore_id;
> +	uint32_t memory;
> +	uint64_t ops;
> +	float bandwidth;
> +	double time_sec;
> +	struct lcore_params *params = (struct lcore_params *)p;
> +	uint32_t repeat_times = params->repeat_times;
> +	uint32_t buf_size = params->buf_size;
> +	uint32_t lcore_nr_buf = params->nr_buf;
> +	uint16_t mpool_iter_step = params->mpool_iter_step;
> +	struct rte_mbuf **srcs = params->srcs;
> +	struct rte_mbuf **dsts = params->dsts;
> +	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> +	uint32_t k, j, offset;
> +
> +	begin = rte_rdtsc();
> +
> +	for (k = 0; k < repeat_times; k++) {
> +		/* copy buffer form src to dst */
> +		for (offset = 0; offset < mpool_iter_step; offset++) {
> +			for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
> +				rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
> +					(void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
> +					(size_t)buf_size);
> +			}
> +		}
> +	}
> +
> +	end = rte_rdtsc();
> +	total_cycles = end - begin;
> +	time_sec = (double)total_cycles / rte_get_timer_hz();
> +
> +	lcore_id = rte_lcore_id();
> +
> +	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> +			&avg_cycles, &bandwidth, &ops);
> +	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
> +			memory, bandwidth, ops, false);
> +
> +	rte_free(p);
> +
> +	return 0;
> +}
> +
> +static int
> +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> +			struct rte_mbuf ***dsts)
> +{
> +	uint32_t i;
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	unsigned int nr_sockets;
> +	uint32_t nr_buf = cfg->nr_buf;
> +
> +	nr_sockets = rte_socket_count();
> +	if (cfg->src_numa_node >= nr_sockets ||
> +		cfg->dst_numa_node >= nr_sockets) {
> +		printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
> +		return -1;
> +	}
> +
> +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +			nr_buf, /* n == num elements */
> +			64,  /* cache size */
> +			0,   /* priv size */
> +			buf_size + RTE_PKTMBUF_HEADROOM,
> +			cfg->src_numa_node);
> +	if (src_pool == NULL) {
> +		PRINT_ERR("Error with source mempool creation.\n");
> +		return -1;
> +	}
> +
> +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +			nr_buf, /* n == num elements */
> +			64,  /* cache size */
> +			0,   /* priv size */
> +			buf_size + RTE_PKTMBUF_HEADROOM,
> +			cfg->dst_numa_node);
> +	if (dst_pool == NULL) {
> +		PRINT_ERR("Error with destination mempool creation.\n");
> +		return -1;
> +	}
> +
> +	*srcs = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));

Typecasts from void * to other types aren't actually necessary in C.
I note some inconsistency in this file with regards to malloc. Here you use
regular malloc, while when building the parameters to pass to the memcpy
functions you use rte_malloc. I suggest standardizing on one or the other
rather than mixing.

> +	if (*srcs == NULL) {
> +		printf("Error: srcs malloc failed.\n");
> +		return -1;
> +	}
> +
> +	*dsts = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));
> +	if (*dsts == NULL) {
> +		printf("Error: dsts malloc failed.\n");
> +		return -1;
> +	}
> +
> +	for (i = 0; i < nr_buf; i++) {
> +		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
> +		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);

Rather than individually allocating you may well manage with
rte_mempool_get_bulk() to allocate all mbufs in one call.

> +		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
> +			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
> +			return -1;
> +		}
> +
> +		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
> +		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;

rte_pktmbuf_append() macro can be used here, rather than setting the
lengths manually. However, these values are not actually used anywhere else
in the code, I believe, so setting them is unnecessary. You are manually
tracking the copy lengths throughout the test, and nothing else is working
on the mbufs, so the length the mbuf reports is immaterial.


> +	}
> +
> +	return 0;
> +}
> +
> +void
> +dma_mem_copy_benchmark(struct test_configure *cfg)
> +{
> +	uint32_t i;
> +	uint32_t offset;
> +	unsigned int lcore_id  = 0;
> +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	uint16_t kick_batch = cfg->kick_batch.cur;
> +	uint16_t mpool_iter_step = cfg->mpool_iter_step;
> +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +	uint16_t nb_workers = cfg->nb_workers;
> +	uint32_t repeat_times = cfg->repeat_times;
> +
> +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +		goto out;
> +
> +	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
> +		goto out;
> +
> +	if (cfg->cache_flush) {
> +		struct buf_info info;
> +
> +		info.array = srcs;
> +		info.buf_size = buf_size;
> +		info.nr_buf = nr_buf;
> +		cache_flush_buf(&info);
> +

From what I can see, struct buf_info is only used for passing parameters to
the cache_flush_buf function. The code would be a lot simpler to remove the
structure and just pass 3 parameters to the function directly.

> +		info.array = dsts;
> +		cache_flush_buf(&info);
> +		rte_mb();
> +	}
> +
> +	printf("Start testing....\n");
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		lcore_id = rte_get_next_lcore(lcore_id, true, true);
> +		offset = nr_buf / nb_workers * i;
> +
> +		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> +		if (!p) {
> +			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> +			break;
> +		}
> +		*p = (struct lcore_params) {
> +			dmadev_ids[i],
> +			(uint32_t)(nr_buf/nb_workers),
> +			kick_batch,
> +			buf_size,
> +			repeat_times,
> +			mpool_iter_step,
> +			srcs + offset,
> +			dsts + offset,
> +			cfg->scenario_id
> +		};
> +
> +		rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
> +	}
> +
> +	rte_eal_mp_wait_lcore();
> +
> +out:
> +	/* free env */
> +	if (srcs) {
> +		for (i = 0; i < nr_buf; i++)
> +			rte_pktmbuf_free(srcs[i]);
> +		free(srcs);
> +	}
> +	if (dsts) {
> +		for (i = 0; i < nr_buf; i++)
> +			rte_pktmbuf_free(dsts[i]);
> +		free(dsts);
> +	}
> +
> +	if (src_pool)
> +		rte_mempool_free(src_pool);
> +	if (dst_pool)
> +		rte_mempool_free(dst_pool);
> +
> +	for (i = 0; i < nb_dmadevs; i++) {
> +		printf("Stopping dmadev %d\n", dmadev_ids[i]);
> +		rte_dma_stop(dmadev_ids[i]);
> +	}
> +}
> +
> +void
> +cpu_mem_copy_benchmark(struct test_configure *cfg)
> +{
> +	uint32_t i, offset;
> +	uint32_t repeat_times = cfg->repeat_times;
> +	uint32_t kick_batch = cfg->kick_batch.cur;
> +	uint32_t buf_size = cfg->buf_size.cur;
> +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +	uint16_t nb_workers = cfg->nb_workers;
> +	uint16_t mpool_iter_step = cfg->mpool_iter_step;
> +	struct rte_mbuf **srcs  = NULL, **dsts  = NULL;
> +	unsigned int lcore_id = 0;
> +
> +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +		goto out;
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
> +		offset = nr_buf / nb_workers * i;
> +		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> +		if (!p) {
> +			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> +			break;
> +		}
> +		*p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
> +						buf_size, repeat_times, mpool_iter_step,
> +						srcs + offset, dsts + offset, cfg->scenario_id };

Formatting should be the same as function above.

> +		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
> +	}
> +
> +	rte_eal_mp_wait_lcore();
> +
> +out:
> +	/* free env */
> +	if (srcs) {
> +		for (i = 0; i < nr_buf; i++)
> +			rte_pktmbuf_free(srcs[i]);
> +		free(srcs);
> +	}
> +	if (dsts) {
> +		for (i = 0; i < nr_buf; i++)
> +			rte_pktmbuf_free(dsts[i]);
> +		free(dsts);
> +	}
> +
> +	if (src_pool)
> +		rte_mempool_free(src_pool);
> +	if (dst_pool)
> +		rte_mempool_free(dst_pool);
> +}

There seems a quite a bit of common code between the dma_mem_copy_benchmark
and cpu_mem_copy_benchmark. Might be worth investigating if they can be
merged while still keeping readability.

> diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
> new file mode 100644
> index 0000000000..f5ad8d6d99
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#ifndef _BENCHMARK_H_
> +#define _BENCHMARK_H_
> +
> +void dma_mem_copy_benchmark(struct test_configure *cfg);
> +
> +void cpu_mem_copy_benchmark(struct test_configure *cfg);
> +
> +#endif /* _BENCHMARK_H_ */

You don't really need two separate headers in this application. Both main.h
and benchmark.h can be merged into one header, since both are always
included in both c files.

> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> new file mode 100644
> index 0000000000..e24bb19414
> --- /dev/null
> +++ b/app/test-dma-perf/config.ini
> @@ -0,0 +1,61 @@
> +
> +; Supported test types:
> +; DMA_MEM_COPY|CPU_MEM_COPY
> +
> +; Parameters:
> +; "mem_size","buf_size","dma_ring_size","kick_batch".
> +; "mem_size" means the size of the memory footprint.
> +; "buf_size" means the memory size of a single operation.
> +; "dma_ring_size" means the dma ring buffer size.
> +; "kick_batch" means dma operation batch size.
> +
> +; Format: variable=first[,last,increment[,ADD|MUL]]
> +; ADD is the default mode.
> +
> +; src_numa_node is used to control the numa node where the source memory is allocated.
> +; dst_numa_node is used to control the numa node where the destination memory is allocated.
> +
> +; cache_flush is used to control if the cache should be flushed.
> +
> +; repeat_times is used to control the repeat times of the whole case.
> +
> +; worker_threads is used to control the threads number of the test app.
> +; It should be less than the core number.
> +
> +; mpool_iter_step is used to control the buffer continuity.
> +
> +; Bind DMA to lcore:
> +; Specify the "lcore_dma" parameter.
> +; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
> +; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
> +; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
> +; their range.
> +
> +[case1]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +repeat_times=10
> +worker_threads=1
> +mpool_iter_step=1
> +lcore_dma=lcore3@0000:00:04.0
> +eal_args=--legacy-mem --file-prefix=test
> +
> +[case2]
> +type=CPU_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=1
> +cache_flush=0
> +repeat_times=100
> +worker_threads=1
> +mpool_iter_step=1
> +eal_args=--no-pci
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> new file mode 100644
> index 0000000000..94ba369539
> --- /dev/null
> +++ b/app/test-dma-perf/main.c
> @@ -0,0 +1,434 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#if !defined(RTE_EXEC_ENV_LINUX)
> +
> +int
> +main(int argc, char *argv[])
> +{
> +	printf("OS not supported, skipping test\n");
> +	return 0;
> +}
> +

What is linux-specific about this app?

If we do need to limit the app to Linux-only I suggest using meson to do so
rather than putting #ifdefs in the code.

> +#else
> +
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <signal.h>

<snip>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH v2] app/dma-perf: introduce dma-perf application
  2023-01-17 13:00   ` Bruce Richardson
@ 2023-01-17 13:54     ` Jiang, Cheng1
  2023-01-17 14:03       ` Bruce Richardson
  0 siblings, 1 reply; 15+ messages in thread
From: Jiang, Cheng1 @ 2023-01-17 13:54 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

Hi Bruce,

Thanks for your comments.
Replies are inline. I'll fix them in the next version.

Thanks,
Cheng

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Tuesday, January 17, 2023 9:00 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org;
> Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma,
> WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v2] app/dma-perf: introduce dma-perf application
> 
> On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre-set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> > v2: fixed some CI issues.
> 
> Some first review comments inline below. More will likely follow as I review it
> further and try testing it out.
> 
> /Bruce
> 
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 539
> > ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/benchmark.h |  12 +  app/test-dma-perf/config.ini
> > |  61 ++++
> >  app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  53 ++++
> >  app/test-dma-perf/meson.build |  22 ++
> >  7 files changed, 1122 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> > app/test-dma-perf/benchmark.h  create mode 100644
> > app/test-dma-perf/config.ini  create mode 100644
> > app/test-dma-perf/main.c  create mode 100644 app/test-dma-perf/main.h
> > create mode 100644 app/test-dma-perf/meson.build
> >
> > diff --git a/app/meson.build b/app/meson.build index
> > e32ea4bd5c..a060ad2725 100644
> > --- a/app/meson.build
> > +++ b/app/meson.build
> > @@ -28,6 +28,7 @@ apps = [
> >          'test-regex',
> >          'test-sad',
> >          'test-security-perf',
> > +        'test-dma-perf',
> >  ]
> 
> Lists in DPDK are always alphabetical when no other order is required,
> therefore this new app should be further up the list, after "test-crypto-perf".

Sure, I'll fix it in the next version.

> 
> >
> >  default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API'] diff
> > --git a/app/test-dma-perf/benchmark.c b/app/test-dma-
> perf/benchmark.c
> > new file mode 100644 index 0000000000..1cb5b0b291
> > --- /dev/null
> > +++ b/app/test-dma-perf/benchmark.c
> > @@ -0,0 +1,539 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation  */
> > +
> > +#include <inttypes.h>
> > +
> > +#include <rte_time.h>
> > +#include <rte_mbuf.h>
> > +#include <rte_dmadev.h>
> > +#include <rte_malloc.h>
> > +#include <rte_lcore.h>
> > +
> > +#include "main.h"
> > +#include "benchmark.h"
> > +
> > +
> > +#define MAX_DMA_CPL_NB 255
> > +
> > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64
> ",%.3lf,%" PRIu64 "\n"
> > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64
> ",%.3lf,%" PRIu64 "\n"
> > +
> > +struct lcore_params {
> > +	uint16_t dev_id;
> > +	uint32_t nr_buf;
> > +	uint16_t kick_batch;
> > +	uint32_t buf_size;
> > +	uint32_t repeat_times;
> > +	uint16_t mpool_iter_step;
> > +	struct rte_mbuf **srcs;
> > +	struct rte_mbuf **dsts;
> > +	uint8_t scenario_id;
> > +};
> > +
> > +struct buf_info {
> > +	struct rte_mbuf **array;
> > +	uint32_t nr_buf;
> > +	uint32_t buf_size;
> > +};
> > +
> > +static struct rte_mempool *src_pool;
> > +static struct rte_mempool *dst_pool;
> > +
> > +uint16_t dmadev_ids[MAX_WORKER_NB];
> > +uint32_t nb_dmadevs;
> > +
> > +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> > +
> > +static inline int
> > +__rte_format_printf(3, 4)
> > +print_err(const char *func, int lineno, const char *format, ...) {
> > +	va_list ap;
> > +	int ret;
> > +
> > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > +	va_start(ap, format);
> > +	ret += vfprintf(stderr, format, ap);
> > +	va_end(ap);
> > +
> > +	return ret;
> > +}
> > +
> > +static inline void
> > +calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double
> time_sec,
> > +			uint32_t repeat_times, uint32_t *memory, uint64_t
> *ave_cycle,
> > +			float *bandwidth, uint64_t *ops)
> > +{
> > +	*memory = (p->buf_size * p->nr_buf * 2) / (1024 * 1024);
> > +	*ave_cycle = cp_cycle_sum / (p->repeat_times * p->nr_buf);
> > +	*bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle *
> 1000 * 1000 * 1000.0);
> > +	*ops = (double)p->nr_buf * repeat_times / time_sec; }
> > +
> > +static void
> > +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id,
> uint64_t ave_cycle,
> > +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> > +			float bandwidth, uint64_t ops, bool is_dma) {
> > +	if (is_dma)
> > +		printf("lcore %u, DMA %u:\n"
> > +				"average cycles: %" PRIu64 ","
> > +				" buffer size: %u, nr_buf: %u,"
> > +				" memory: %uMB, frequency: %" PRIu64
> ".\n",
> > +				lcore_id,
> > +				dev_id,
> > +				ave_cycle,
> > +				buf_size,
> > +				nr_buf,
> > +				memory,
> > +				rte_get_timer_hz());
> 
> Longer lines are allowed for strings, so you can merge each line of output to a
> single line, which will improve readability.
> Also, to shorten the code, there is no reason each parameter needs to go on
> its own line.

Yes, totally make sense. I'll fix it in the next version.

> 
> > +	else
> > +		printf("lcore %u\n"
> > +			"average cycles: %" PRIu64 ","
> > +			" buffer size: %u, nr_buf: %u,"
> > +			" memory: %uMB, frequency: %" PRIu64 ".\n",
> > +			lcore_id,
> > +			ave_cycle,
> > +			buf_size,
> > +			nr_buf,
> > +			memory,
> > +			rte_get_timer_hz());
> 
> Suggestion, rather than duplicating the whole output, only the first line
> needs to change based on SW vs HW copies. How about:
> 
> 	if (is_dma)
> 		printf("lcore %u, DMA %u\n", lcore_id, dev_id);
> 	else
> 		printf("lcore %u\n", lcore_id);
> 	printf("average cycles: ..." , ...);
> 

Got it, good point. I'll fix it.

> > +
> > +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n",
> > +bandwidth, ops);
> > +
> > +	if (is_dma)
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > +			CSV_LINE_DMA_FMT,
> > +			scenario_id, lcore_id, dev_id, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, ops);
> > +	else
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > +			CSV_LINE_CPU_FMT,
> > +			scenario_id, lcore_id, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, ops); }
> > +
> > +static inline void
> > +cache_flush_buf(void *arg)
> 
> For non-x86 builds, you probably need to mark "arg" as unused to avoid
> compiler warnings.

Sure, I was wondering how to avoid compiler warnings, thanks for your advice.

> 
> Why is the parameter type given as a void pointer, when the type is
> unconditionally cast below as "struct buf_info"? Void pointer type should
> only be needed if you need to call this via a generic function pointer.

You are right, I'll fix it.

> 
> > +{
> > +#ifdef RTE_ARCH_X86_64
> > +	char *data;
> > +	char *addr;
> > +	struct buf_info *info = arg;
> > +	struct rte_mbuf **srcs = info->array;
> > +	uint32_t i, k;
> > +
> > +	for (i = 0; i < info->nr_buf; i++) {
> > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > +		for (k = 0; k < info->buf_size / 64; k++) {
> > +			addr = (k * 64 + data);
> > +			__builtin_ia32_clflush(addr);
> > +		}
> 
> inner loop may be shorter by incrementing loop var by 64, rather than
> dividing and then multiplying, since you can eliminate variable "addr".
> Also can be more readable with a variable rename:
> 
> 	for (offset = 0; offset < info->buf_size; offset += 64)
> 		__builtin_ia32_clflush(data + offset);
> 

Sure, totally make sense to me, thanks.

> > +	}
> > +#endif
> > +}
> > +
> > +/* Configuration of device. */
> > +static void
> > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > +	uint16_t vchan = 0;
> > +	struct rte_dma_info info;
> > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > +	struct rte_dma_vchan_conf qconf = {
> > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > +		.nb_desc = ring_size
> > +	};
> > +
> > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
> > +
> > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0) {
> > +		printf("Error with queue configuration\n");
> > +		rte_panic();
> > +	}
> > +
> 
> Inconsistency here - and below too. Either use rte_exit on failure or use
> rte_panic, but don't mix them. Panic seems a little severe, so I suggest just
> using rte_exit() in all cases.

Sure.

> 
> > +	rte_dma_info_get(dev_id, &info);
> > +	if (info.nb_vchans != 1) {
> > +		printf("Error, no configured queues reported on device
> id %u\n", dev_id);
> > +		rte_panic();
> > +	}
> > +	if (rte_dma_start(dev_id) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n"); }
> > +
> > +static int
> > +config_dmadevs(uint32_t nb_workers, uint32_t ring_size) {
> > +	int16_t dev_id = rte_dma_next_dev(0);
> > +	uint32_t i;
> > +
> > +	nb_dmadevs = 0;
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		if (dev_id == -1)
> > +			goto end;
> > +
> > +		dmadev_ids[i] = dev_id;
> > +		configure_dmadev_queue(dmadev_ids[i], ring_size);
> > +		dev_id = rte_dma_next_dev(dev_id + 1);
> > +		++nb_dmadevs;
> 
> Very minor nit, but I'd suggest swapping these last two lines, incrementing
> nb_dmadevs right after configuring the device, but before finding a new one.
> It just makes more sense to me.

Sure.

> 
> > +	}
> > +
> > +end:
> > +	if (nb_dmadevs < nb_workers) {
> > +		printf("Not enough dmadevs (%u) for all workers (%u).\n",
> nb_dmadevs, nb_workers);
> > +		return -1;
> > +	}
> > +
> > +	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n",
> nb_dmadevs);
> > +
> > +	return 0;
> > +}
> > +
> > +static inline void
> > +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t
> kick_batch, uint32_t buf_size,
> > +			uint16_t mpool_iter_step, struct rte_mbuf **srcs,
> struct rte_mbuf
> > +**dsts) {
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t index;
> > +	uint16_t offset;
> > +	uint32_t i;
> > +
> > +	for (offset = 0; offset < mpool_iter_step; offset++) {
> > +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf;
> > +i++) {
> 
> Assignment in the condition part of a loop seems wrong. I suggest reworking
> this to avoid it.

Sure, I'll reconsider it.

> 
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova +
> srcs[index]->data_off,
> > +						dsts[index]->buf_iova +
> dsts[index]->data_off,
> 
> rte_pktmbuf_iova() macro can be used here.

Sure, sorry I missed it.

> 
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0) {
> > +					nr_cpl = rte_dma_completed(dev_id,
> 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova +
> srcs[index]->data_off,
> > +						dsts[index]->buf_iova +
> dsts[index]->data_off,
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> index);
> > +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			/**
> > +			 * When '&' is used to wrap an index, mask must be a
> power of 2.
> > +			 * That is, kick_batch must be 2^n.
> 
> I assume that is checked on input processing when parsing the config file?

I'll check it in the next version.

> 
> > +			 */
> > +			if (unlikely((async_cnt % kick_batch) == 0)) {
> 
> This is an expected condition that will occur with repeatable frequency.
> Therefore, unlikely is not really appropriate.

Sure, I'll reconsider it.

> 
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +			}
> > +		}
> > +
> > +		rte_dma_submit(dev_id, 0);
> > +		while (async_cnt > 0) {
> > +			nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> > +			async_cnt -= nr_cpl;
> > +		}
> 
> Do we need a timeout here or in the loop above incase of errors that cause
> us to not get all the elements back?

Make sense, I'll consider it in the next version. Thanks.

> 
> > +	}
> > +}
> > +
> > +static int
> > +dma_mem_copy(void *p)
> > +{
> 
> I see the call to this function within "remote_launch" uses a cast on the
> function. I don't think that typecast should be necessary, but if you keep it,
> you can avoid using the void pointer here and just mark the input type as
> "struct lcore_params" directly.

OK, make sense to me.

> 
> > +	uint64_t ops;
> > +	uint32_t memory;
> > +	float bandwidth;
> > +	double time_sec;
> > +	uint32_t lcore_id = rte_lcore_id();
> > +	struct lcore_params *params = (struct lcore_params *)p;
> > +	uint32_t repeat_times = params->repeat_times;
> > +	uint32_t buf_size = params->buf_size;
> > +	uint16_t kick_batch = params->kick_batch;
> > +	uint32_t lcore_nr_buf = params->nr_buf;
> > +	uint16_t dev_id = params->dev_id;
> > +	uint16_t mpool_iter_step = params->mpool_iter_step;
> > +	struct rte_mbuf **srcs = params->srcs;
> > +	struct rte_mbuf **dsts = params->dsts;
> > +	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> > +	uint32_t r;
> > +
> > +	begin = rte_rdtsc();
> > +
> > +	for (r = 0; r < repeat_times; r++)
> > +		do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch,
> buf_size,
> > +			mpool_iter_step, srcs, dsts);
> > +
> > +	end = rte_rdtsc();
> > +	total_cycles = end - begin;
> 
> You can do without "end" easily enough:
> 	total_cycles = rte_rdtsc() - begin;

Got it, thanks for your advice.

> 
> > +	time_sec = (double)total_cycles / rte_get_timer_hz();
> > +
> > +	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> > +			&avg_cycles, &bandwidth, &ops);
> > +	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles,
> buf_size, lcore_nr_buf,
> > +			memory, bandwidth, ops, true);
> > +
> > +	rte_free(p);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +cpu_mem_copy(void *p)
> > +{
> 
> Most of comments from above, also apply here.

Sure, I'll fix them in the next version.

> 
> > +	uint32_t idx;
> > +	uint32_t lcore_id;
> > +	uint32_t memory;
> > +	uint64_t ops;
> > +	float bandwidth;
> > +	double time_sec;
> > +	struct lcore_params *params = (struct lcore_params *)p;
> > +	uint32_t repeat_times = params->repeat_times;
> > +	uint32_t buf_size = params->buf_size;
> > +	uint32_t lcore_nr_buf = params->nr_buf;
> > +	uint16_t mpool_iter_step = params->mpool_iter_step;
> > +	struct rte_mbuf **srcs = params->srcs;
> > +	struct rte_mbuf **dsts = params->dsts;
> > +	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> > +	uint32_t k, j, offset;
> > +
> > +	begin = rte_rdtsc();
> > +
> > +	for (k = 0; k < repeat_times; k++) {
> > +		/* copy buffer form src to dst */
> > +		for (offset = 0; offset < mpool_iter_step; offset++) {
> > +			for (j = 0; idx = j * mpool_iter_step + offset, idx <
> lcore_nr_buf; j++) {
> > +				rte_memcpy((void
> *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
> > +					(void
> *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
> > +					(size_t)buf_size);
> > +			}
> > +		}
> > +	}
> > +
> > +	end = rte_rdtsc();
> > +	total_cycles = end - begin;
> > +	time_sec = (double)total_cycles / rte_get_timer_hz();
> > +
> > +	lcore_id = rte_lcore_id();
> > +
> > +	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> > +			&avg_cycles, &bandwidth, &ops);
> > +	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size,
> lcore_nr_buf,
> > +			memory, bandwidth, ops, false);
> > +
> > +	rte_free(p);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> > +			struct rte_mbuf ***dsts)
> > +{
> > +	uint32_t i;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	unsigned int nr_sockets;
> > +	uint32_t nr_buf = cfg->nr_buf;
> > +
> > +	nr_sockets = rte_socket_count();
> > +	if (cfg->src_numa_node >= nr_sockets ||
> > +		cfg->dst_numa_node >= nr_sockets) {
> > +		printf("Error: Source or destination numa exceeds the acture
> numa nodes.\n");
> > +		return -1;
> > +	}
> > +
> > +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > +			nr_buf, /* n == num elements */
> > +			64,  /* cache size */
> > +			0,   /* priv size */
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->src_numa_node);
> > +	if (src_pool == NULL) {
> > +		PRINT_ERR("Error with source mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > +			nr_buf, /* n == num elements */
> > +			64,  /* cache size */
> > +			0,   /* priv size */
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->dst_numa_node);
> > +	if (dst_pool == NULL) {
> > +		PRINT_ERR("Error with destination mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	*srcs = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf
> > +*)));
> 
> Typecast for void * to other types aren't actually necessary in C.
> I note some inconsistency in this file with regards to malloc. Here you use
> regular malloc, while when building the parameters to pass to the memcpy
> functions you use rte_malloc. I suggest standardizing on one or the other
> rather than mixing.

Good point, thanks.

> 
> > +	if (*srcs == NULL) {
> > +		printf("Error: srcs malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	*dsts = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf
> *)));
> > +	if (*dsts == NULL) {
> > +		printf("Error: dsts malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	for (i = 0; i < nr_buf; i++) {
> > +		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
> > +		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);
> 
> Rather than individually allocating you may well manage with
> rte_mempool_get_bulk() to allocate all mbufs in one call.

Sure, thanks.

> 
> > +		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
> > +			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
> > +			return -1;
> > +		}
> > +
> > +		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
> > +		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;
> 
> rte_pktmbuf_append() macro can be used here, rather than setting the
> lengths manually. However, these values are not actually used anywhere
> else in the code, I believe, so setting them is unnecessary. You are manually
> tracking the copy lengths throughout the test, and nothing else is working on
> the mbufs, so the length the mbuf reports is immaterial.

Sure, it will be fixed.

> 
> 
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +void
> > +dma_mem_copy_benchmark(struct test_configure *cfg) {
> > +	uint32_t i;
> > +	uint32_t offset;
> > +	unsigned int lcore_id  = 0;
> > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > +	uint16_t mpool_iter_step = cfg->mpool_iter_step;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = cfg->nb_workers;
> > +	uint32_t repeat_times = cfg->repeat_times;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
> > +		goto out;
> > +
> > +	if (cfg->cache_flush) {
> > +		struct buf_info info;
> > +
> > +		info.array = srcs;
> > +		info.buf_size = buf_size;
> > +		info.nr_buf = nr_buf;
> > +		cache_flush_buf(&info);
> > +
> 
> From what I can see, struct buf_info is only used for passing parameters to
> the cache_flush_buf function. The code would be a lot simpler to remove
> the structure and just pass 3 parameters to the function directly.

Good point, thanks.

> 
> > +		info.array = dsts;
> > +		cache_flush_buf(&info);
> > +		rte_mb();
> > +	}
> > +
> > +	printf("Start testing....\n");
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = rte_get_next_lcore(lcore_id, true, true);
> > +		offset = nr_buf / nb_workers * i;
> > +
> > +		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> > +		if (!p) {
> > +			printf("lcore parameters malloc failure for
> lcore %d\n", lcore_id);
> > +			break;
> > +		}
> > +		*p = (struct lcore_params) {
> > +			dmadev_ids[i],
> > +			(uint32_t)(nr_buf/nb_workers),
> > +			kick_batch,
> > +			buf_size,
> > +			repeat_times,
> > +			mpool_iter_step,
> > +			srcs + offset,
> > +			dsts + offset,
> > +			cfg->scenario_id
> > +		};
> > +
> > +		rte_eal_remote_launch((lcore_function_t
> *)dma_mem_copy, p, lcore_id);
> > +	}
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(srcs[i]);
> > +		free(srcs);
> > +	}
> > +	if (dsts) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(dsts[i]);
> > +		free(dsts);
> > +	}
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +
> > +	for (i = 0; i < nb_dmadevs; i++) {
> > +		printf("Stopping dmadev %d\n", dmadev_ids[i]);
> > +		rte_dma_stop(dmadev_ids[i]);
> > +	}
> > +}
> > +
> > +void
> > +cpu_mem_copy_benchmark(struct test_configure *cfg) {
> > +	uint32_t i, offset;
> > +	uint32_t repeat_times = cfg->repeat_times;
> > +	uint32_t kick_batch = cfg->kick_batch.cur;
> > +	uint32_t buf_size = cfg->buf_size.cur;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = cfg->nb_workers;
> > +	uint16_t mpool_iter_step = cfg->mpool_iter_step;
> > +	struct rte_mbuf **srcs  = NULL, **dsts  = NULL;
> > +	unsigned int lcore_id = 0;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() >
> 1 ? 1 : 0, 1);
> > +		offset = nr_buf / nb_workers * i;
> > +		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> > +		if (!p) {
> > +			printf("lcore parameters malloc failure for
> lcore %d\n", lcore_id);
> > +			break;
> > +		}
> > +		*p = (struct lcore_params) { 0, nr_buf/nb_workers,
> kick_batch,
> > +						buf_size, repeat_times,
> mpool_iter_step,
> > +						srcs + offset, dsts + offset,
> cfg->scenario_id };
> 
> Formatting should be the same as function above.

Sure.

> 
> > +		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy,
> p, lcore_id);
> > +	}
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(srcs[i]);
> > +		free(srcs);
> > +	}
> > +	if (dsts) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(dsts[i]);
> > +		free(dsts);
> > +	}
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +}
> 
> There seems a quite a bit of common code between the
> dma_mem_copy_benchmark and cpu_mem_copy_benchmark. Might be
> worth investigating if they can be merged while still keeping readability.

Yes you're right. I'll consider it.

> 
> > diff --git a/app/test-dma-perf/benchmark.h
> > b/app/test-dma-perf/benchmark.h new file mode 100644 index
> > 0000000000..f5ad8d6d99
> > --- /dev/null
> > +++ b/app/test-dma-perf/benchmark.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation  */
> > +
> > +#ifndef _BENCHMARK_H_
> > +#define _BENCHMARK_H_
> > +
> > +void dma_mem_copy_benchmark(struct test_configure *cfg);
> > +
> > +void cpu_mem_copy_benchmark(struct test_configure *cfg);
> > +
> > +#endif /* _BENCHMARK_H_ */
> 
> You don't really need two separate headers in this application. Both main.h
> and benchmark.h can be merged into one header, since both are always
> included in both c files.

Sure, make sense.

> 
> > diff --git a/app/test-dma-perf/config.ini
> > b/app/test-dma-perf/config.ini new file mode 100644 index
> > 0000000000..e24bb19414
> > --- /dev/null
> > +++ b/app/test-dma-perf/config.ini
> > @@ -0,0 +1,61 @@
> > +
> > +; Supported test types:
> > +; DMA_MEM_COPY|CPU_MEM_COPY
> > +
> > +; Parameters:
> > +; "mem_size","buf_size","dma_ring_size","kick_batch".
> > +; "mem_size" means the size of the memory footprint.
> > +; "buf_size" means the memory size of a single operation.
> > +; "dma_ring_size" means the dma ring buffer size.
> > +; "kick_batch" means dma operation batch size.
> > +
> > +; Format: variable=first[,last,increment[,ADD|MUL]]
> > +; ADD is the default mode.
> > +
> > +; src_numa_node is used to control the numa node where the source
> memory is allocated.
> > +; dst_numa_node is used to control the numa node where the
> destination memory is allocated.
> > +
> > +; cache_flush is used to control if the cache should be flushed.
> > +
> > +; repeat_times is used to control the repeat times of the whole case.
> > +
> > +; worker_threads is used to control the threads number of the test app.
> > +; It should be less than the core number.
> > +
> > +; mpool_iter_step is used to control the buffer continuity.
> > +
> > +; Bind DMA to lcore:
> > +; Specify the "lcore_dma" parameter.
> > +; The number of "lcore_dma" should be greater than or equal to the
> number of "worker_threads".
> > +; Otherwise the remaining DMA devices will be automatically allocated
> > +to threads that are not ; specified. If EAL parameters "-l" and "-a"
> > +are specified, the "lcore_dma" should be within ; their range.
> > +
> > +[case1]
> > +type=DMA_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=0
> > +cache_flush=0
> > +repeat_times=10
> > +worker_threads=1
> > +mpool_iter_step=1
> > +lcore_dma=lcore3@0000:00:04.0
> > +eal_args=--legacy-mem --file-prefix=test
> > +
> > +[case2]
> > +type=CPU_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=1
> > +cache_flush=0
> > +repeat_times=100
> > +worker_threads=1
> > +mpool_iter_step=1
> > +eal_args=--no-pci
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c new
> > file mode 100644 index 0000000000..94ba369539
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.c
> > @@ -0,0 +1,434 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation  */
> > +
> > +#include <stdio.h>
> > +#if !defined(RTE_EXEC_ENV_LINUX)
> > +
> > +int
> > +main(int argc, char *argv[])
> > +{
> > +	printf("OS not supported, skipping test\n");
> > +	return 0;
> > +}
> > +
> 
> What is linux-specific about this app?
> 
> If we do need to limit the app to Linux-only I suggest using meson to do so
> rather than putting #ifdefs in the code.

Got it. Thanks!

> 
> > +#else
> > +
> > +#include <stdlib.h>
> > +#include <getopt.h>
> > +#include <signal.h>
> 
> <snip>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2] app/dma-perf: introduce dma-perf application
  2023-01-17 13:54     ` Jiang, Cheng1
@ 2023-01-17 14:03       ` Bruce Richardson
  2023-01-18  1:46         ` Jiang, Cheng1
  0 siblings, 1 reply; 15+ messages in thread
From: Bruce Richardson @ 2023-01-17 14:03 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

On Tue, Jan 17, 2023 at 01:54:50PM +0000, Jiang, Cheng1 wrote:
> Hi Bruce,
> 
> Thanks for your comments.
> Replies are inline. I'll fix them in the next version.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Richardson, Bruce <bruce.richardson@intel.com>
> > Sent: Tuesday, January 17, 2023 9:00 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org;
> > Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma,
> > WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> > Subject: Re: [PATCH v2] app/dma-perf: introduce dma-perf application
> > 
> > On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:

<snip>

> > > +#if !defined(RTE_EXEC_ENV_LINUX)
> > > +
> > > +int
> > > +main(int argc, char *argv[])
> > > +{
> > > +	printf("OS not supported, skipping test\n");
> > > +	return 0;
> > > +}
> > > +
> > 
> > What is linux-specific about this app?
> > 
> > If we do need to limit the app to Linux-only I suggest using meson to do so
> > rather than putting #ifdefs in the code.
> 
> Got it. Thanks!
> 
Can you maybe clarify what is linux-specific about this app, or else
perhaps test at least building on FreeBSD and windows. I'd rather not see
stuff added just for Linux without some attempt to try on other OS's.

/Bruce

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-17 12:05 ` [PATCH v3] " Cheng Jiang
@ 2023-01-17 15:44   ` Bruce Richardson
  2023-01-19  7:18     ` Jiang, Cheng1
  2023-01-17 16:51   ` Bruce Richardson
  1 sibling, 1 reply; 15+ messages in thread
From: Bruce Richardson @ 2023-01-17 15:44 UTC (permalink / raw)
  To: Cheng Jiang
  Cc: thomas, mb, dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang,
	xingguang.he

On Tue, Jan 17, 2023 at 12:05:26PM +0000, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>

Hi,

more review comments inline below.

/Bruce

> ---

<snip>

> eal_args=--legacy-mem --file-prefix=test

Why use legacy-mem mode? Rather than these options, just use
"--in-memory" to avoid any conflicts. While this is only an example config,
we should steer people away from legacy memory mode.

> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> new file mode 100644
> index 0000000000..8041f5fdaf
> --- /dev/null
> +++ b/app/test-dma-perf/main.c
> @@ -0,0 +1,434 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#if !defined(RTE_EXEC_ENV_LINUX)
> +
> +int
> +main(int argc, char *argv[])
> +{
> +	printf("OS not supported, skipping test\n");
> +	return 0;
> +}
> +
> +#else
> +
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <unistd.h>
> +#include <sys/wait.h>
> +#include <inttypes.h>
> +
> +#include <rte_eal.h>
> +#include <rte_cfgfile.h>
> +#include <rte_string_fns.h>
> +#include <rte_lcore.h>
> +
> +#include "main.h"
> +#include "benchmark.h"
> +
> +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),OPS\n"
> +
> +#define MAX_EAL_PARAM_NB 100
> +#define MAX_EAL_PARAM_LEN 1024
> +
> +#define DMA_MEM_COPY "DMA_MEM_COPY"
> +#define CPU_MEM_COPY "CPU_MEM_COPY"
> +
> +#define MAX_PARAMS_PER_ENTRY 4
> +
> +enum {
> +	TEST_TYPE_NONE = 0,
> +	TEST_TYPE_DMA_MEM_COPY,
> +	TEST_TYPE_CPU_MEM_COPY
> +};
> +
> +#define MAX_TEST_CASES 16
> +static struct test_configure test_cases[MAX_TEST_CASES];
> +
> +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> +
> +static FILE *fd;
> +
> +static void
> +output_csv(bool need_blankline)
> +{
> +	uint32_t i;
> +
> +	if (need_blankline) {
> +		fprintf(fd, "%s", ",,,,,,,,\n");
> +		fprintf(fd, "%s", ",,,,,,,,\n");
you don't need the "%s" here. The string you are outputting is constant.
> +	}
> +
> +	for (i = 0; i < RTE_DIM(output_str); i++) {
> +		if (output_str[i][0]) {
> +			fprintf(fd, "%s", output_str[i]);
> +			memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);

Rather than zeroing the whole string with memset, would
"output_str[i][0] = '\0';" not work instead?

> +		}
> +	}
> +
> +	fflush(fd);
> +}
> +
> +static void
> +output_env_info(void)
> +{
> +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
> +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64 "\n", rte_get_timer_hz());
> +
> +	output_csv(true);
> +}
> +
> +static void
> +output_header(uint32_t case_id, struct test_configure *case_cfg)
> +{
> +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> +			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
> +
> +	output_csv(true);
> +}
> +
> +static void
> +run_test_case(struct test_configure *case_cfg)
> +{
> +	switch (case_cfg->test_type) {
> +	case TEST_TYPE_DMA_MEM_COPY:
> +		dma_mem_copy_benchmark(case_cfg);
> +		break;
> +	case TEST_TYPE_CPU_MEM_COPY:
> +		cpu_mem_copy_benchmark(case_cfg);
> +		break;
> +	default:
> +		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> +		break;
> +	}
> +}
> +
> +static void
> +run_test(uint32_t case_id, struct test_configure *case_cfg)
> +{
> +	uint32_t i;
> +	uint32_t nb_lcores = rte_lcore_count();
> +	struct test_configure_entry *mem_size = &case_cfg->mem_size;
> +	struct test_configure_entry *buf_size = &case_cfg->buf_size;
> +	struct test_configure_entry *ring_size = &case_cfg->ring_size;
> +	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
> +	struct test_configure_entry *var_entry = NULL;
> +
> +	for (i = 0; i < RTE_DIM(output_str); i++)
> +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> +
> +	if (nb_lcores <= case_cfg->nb_workers) {
> +		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
> +			case_id, nb_lcores, case_cfg->nb_workers);
> +		return;
> +	}
> +
> +	RTE_LOG(INFO, DMA, "Number of used lcores: %u.\n", nb_lcores);
> +
> +	if (mem_size->incr != 0)
> +		var_entry = mem_size;
> +
> +	if (buf_size->incr != 0)
> +		var_entry = buf_size;
> +
> +	if (ring_size->incr != 0)
> +		var_entry = ring_size;
> +
> +	if (kick_batch->incr != 0)
> +		var_entry = kick_batch;
> +
> +	case_cfg->scenario_id = 0;
> +
> +	output_header(case_id, case_cfg);
> +
> +	if (var_entry) {

Things may be a bit simpler if instead of branching here, you initialize
var_entry to a null var_entry i.e.

	struct test_configure_entry dummy = { 0 };
	struct test_configure_entry *var_entry = &dummy;

This gives you a single-iteration loop in the case where there is nothing
to vary.

> +		for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
> +			case_cfg->scenario_id++;
> +			printf("\nRunning scenario %d\n", case_cfg->scenario_id);
> +
> +			run_test_case(case_cfg);
> +			output_csv(false);
> +
> +			if (var_entry->op == OP_MUL)
> +				var_entry->cur *= var_entry->incr;
> +			else
> +				var_entry->cur += var_entry->incr;
> +
> +
> +		}
> +	} else {
> +		run_test_case(case_cfg);
> +		output_csv(false);
> +	}
> +}
> +
> +static int
> +parse_entry(const char *value, struct test_configure_entry *entry)
> +{
> +	char input[255] = {0};
> +	char *args[MAX_PARAMS_PER_ENTRY];
> +	int args_nr = -1;
> +
> +	strncpy(input, value, 254);
> +	if (*input == '\0')
> +		goto out;
> +
> +	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
> +	if (args_nr <= 0)
> +		goto out;
> +
> +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> +	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
> +	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
> +
> +	if (args_nr > 3) {
> +		if (!strcmp(args[3], "MUL"))
> +			entry->op = OP_MUL;
> +		else
> +			entry->op = OP_ADD;

This means accepting invalid input. I think you should check the value
against "ADD" so as to reject values like "SUB".

> +	} else
> +		entry->op = OP_NONE;
> +out:
> +	return args_nr;
> +}
> +
> +static void
> +load_configs(void)
> +{
> +	struct rte_cfgfile *cfgfile;
> +	int nb_sections, i;
> +	struct test_configure *test_case;
> +	char **sections_name;
> +	const char *section_name, *case_type;
> +	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
> +	int args_nr, nb_vp;
> +
> +	sections_name = malloc(MAX_TEST_CASES * sizeof(char *));
> +	for (i = 0; i < MAX_TEST_CASES; i++)
> +		sections_name[i] = malloc(CFG_NAME_LEN * sizeof(char *));
> +

I don't think you need to do this work, allocating space for a bunch of
section names. From the example, it looks like the sections should be
called "case1", "case2" etc., so you can just iterate through those
sections, rather than allowing sections to have arbitrary names.

> +	cfgfile = rte_cfgfile_load("./config.ini", 0);
> +	if (!cfgfile) {
> +		printf("Open configure file error.\n");
> +		exit(1);
> +	}

Don't hard-code the config file name. This should be taken from a
commandline parameter, so that one can have collections of different test
cases.

> +
> +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> +	if (nb_sections > MAX_TEST_CASES) {
> +		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
> +		exit(1);
> +	}
> +	rte_cfgfile_sections(cfgfile, sections_name, MAX_TEST_CASES);
> +	for (i = 0; i < nb_sections; i++) {

Iterate through names here, built up dynamically to save memory space.

> +		test_case = &test_cases[i];
> +		section_name = sections_name[i];
> +		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
> +		if (!case_type) {
> +			printf("Error: No case type in case %d\n.", i + 1);
> +			exit(1);
> +		}
> +		if (!strcmp(case_type, DMA_MEM_COPY)) {

Coding standard for DPDK requires this to be "strcmp(...) == 0" rather than
using "!" operator.

> +			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
> +			test_case->test_type_str = DMA_MEM_COPY;
> +		} else if (!strcmp(case_type, CPU_MEM_COPY)) {
> +			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
> +			test_case->test_type_str = CPU_MEM_COPY;
> +		} else {
> +			printf("Error: Cannot find case type %s.\n", case_type);
> +			exit(1);
> +		}
> +
> +		nb_vp = 0;
> +
> +		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "src_numa_node"));
> +		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dst_numa_node"));
> +
> +		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
> +		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
> +		if (args_nr < 0) {
> +			printf("parse error\n");
> +			break;
> +		} else if (args_nr > 1)
> +			nb_vp++;
> +
> +		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
> +		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
> +		if (args_nr < 0) {
> +			printf("parse error\n");
> +			break;
> +		} else if (args_nr > 1)
> +			nb_vp++;
> +
> +		ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "dma_ring_size");
> +		args_nr = parse_entry(ring_size_str, &test_case->ring_size);
> +		if (args_nr < 0) {
> +			printf("parse error\n");
> +			break;
> +		} else if (args_nr > 1)
> +			nb_vp++;
> +
> +		kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
> +		args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
> +		if (args_nr < 0) {
> +			printf("parse error\n");
> +			break;
> +		} else if (args_nr > 1)
> +			nb_vp++;
> +
> +		if (nb_vp > 2) {
> +			printf("%s, variable parameters can only have one.\n", section_name);

Reword to: "Error, each section can only have a single variable parameter"
Also, comparison should be ">= 2" (or "> 1") rather than "> 2", which would
allow 2 as a valid value.

> +			break;
> +		}
> +
> +		test_case->cache_flush =
> +			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name, "cache_flush"));
> +		test_case->repeat_times =
> +			(uint32_t)atoi(rte_cfgfile_get_entry(cfgfile,
> +					section_name, "repeat_times"));
> +		test_case->nb_workers =
> +			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> +					section_name, "worker_threads"));
> +		test_case->mpool_iter_step =
> +			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> +					section_name, "mpool_iter_step"));
> +
> +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
> +	}
> +
> +	rte_cfgfile_close(cfgfile);
> +	for (i = 0; i < MAX_TEST_CASES; i++) {
> +		if (sections_name[i] != NULL)
> +			free(sections_name[i]);

Two points here:

1. You don't need to check for NULL before calling "free()". Free just does
nothing if passed a null pointer

2. None of these values should be NULL anyway, and you need to check the
return from the malloc call. If you *do* keep the current way of reading
sections (and I recommend you don't - see my comments above), you need to
check that each malloc succeeds or else the call to "rte_cfgfile_sections"
will try and do a strlcpy to a null pointer.

> +	}
> +	free(sections_name);
> +}
> +
> +/* Parse the argument given in the command line of the application */
> +static int
> +append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
> +{
> +	int i;
> +	char *tokens[MAX_EAL_PARAM_NB];
> +	char args[MAX_EAL_PARAM_LEN] = {0};
> +	int new_argc, token_nb;
> +
> +	new_argc = argc;
> +
> +	for (i = 0; i < argc; i++)
> +		strcpy(new_argv[i], argv[i]);

I'm not sure we have a guarantee that new_argv will be big enough, do we?
Better to use strlcpy just in case here.

> +
> +	if (eal_args) {
> +		strcpy(args, eal_args);

Use strlcpy for safety.

> +		token_nb = rte_strsplit(args, strlen(args),
> +					tokens, MAX_EAL_PARAM_NB, ' ');
> +		for (i = 0; i < token_nb; i++)
> +			strcpy(new_argv[new_argc++], tokens[i]);
> +	}
> +
> +	return new_argc;
> +}
> +
> +int
> +main(int argc __maybe_unused, char *argv[] __maybe_unused)
> +{
> +	int ret;
> +	uint32_t i, nb_lcores;
> +	pid_t cpid, wpid;
> +	int wstatus;
> +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> +	char *pargs[100];

char *pargs[MAX_EAL_PARAM_NB] ??

> +	int new_argc;
> +
> +
> +	memset(args, 0, sizeof(args));
> +	for (i = 0; i < 100; i++)

RTE_DIM(pargs)

> +		pargs[i] = args[i];
> +
> +	load_configs();
> +	fd = fopen("./test_result.csv", "w");

Like the config file, the result output file should be configurable.
Perhaps it should be based off the config file name?

	test1.ini => test1_result.csv
	config.ini => config_result.csv

> +	if (!fd) {
> +		printf("Open output CSV file error.\n");
> +		return 0;
> +	}
> +	fclose(fd);
> +
> +	/* loop each case, run it */
> +	for (i = 0; i < MAX_TEST_CASES; i++) {
> +		if (test_cases[i].test_type != TEST_TYPE_NONE) {

Flip this condition to reduce indentation:

	if (test_cases[i].test_type == TEST_TYPE_NONE)
		continue;

> +			cpid = fork();
> +			if (cpid < 0) {
> +				printf("Fork case %d failed.\n", i + 1);
> +				exit(EXIT_FAILURE);
> +			} else if (cpid == 0) {
> +				printf("\nRunning case %u\n", i + 1);
> +
> +				if (test_cases[i].eal_args) {
> +					new_argc = append_eal_args(argc, argv,
> +						test_cases[i].eal_args, pargs);
> +
> +					ret = rte_eal_init(new_argc, pargs);
> +				} else {

You don't need this if-else here. The append_eal_args function handles a
NULL parameter, so unconditionally call append_eal_args and then
eal_init(new_argc, pargs). We won't notice the difference in init time, but
the code would be clearer.

> +					ret = rte_eal_init(argc, argv);
> +				}
> +				if (ret < 0)
> +					rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
> +
> +				/* Check lcores. */
> +				nb_lcores = rte_lcore_count();
> +				if (nb_lcores < 2)
> +					rte_exit(EXIT_FAILURE,
> +						"There should be at least 2 worker lcores.\n");
> +
> +				fd = fopen("./test_result.csv", "a");
> +				if (!fd) {
> +					printf("Open output CSV file error.\n");
> +					return 0;
> +				}
> +
> +				if (i == 0)
> +					output_env_info();

Beware that you have a condition above to skip any test cases where
"test_type == TEST_TYPE_NONE". Therefore, if the first test is of type
NONE, the env_info will never be printed.

> +				run_test(i + 1, &test_cases[i]);
> +
> +				/* clean up the EAL */
> +				rte_eal_cleanup();
> +
> +				fclose(fd);
> +
> +				printf("\nCase %u completed.\n", i + 1);
> +
> +				exit(EXIT_SUCCESS);
> +			} else {
> +				wpid = waitpid(cpid, &wstatus, 0);
> +				if (wpid == -1) {
> +					printf("waitpid error.\n");
> +					exit(EXIT_FAILURE);
> +				}
> +
> +				if (WIFEXITED(wstatus))
> +					printf("Case process exited. status %d\n",
> +						WEXITSTATUS(wstatus));
> +				else if (WIFSIGNALED(wstatus))
> +					printf("Case process killed by signal %d\n",
> +						WTERMSIG(wstatus));
> +				else if (WIFSTOPPED(wstatus))
> +					printf("Case process stopped by signal %d\n",
> +						WSTOPSIG(wstatus));
> +				else if (WIFCONTINUED(wstatus))
> +					printf("Case process continued.\n");
> +				else
> +					printf("Case process unknown terminated.\n");
> +			}
> +		}
> +	}
> +
> +	printf("Bye...\n");
> +	return 0;
> +}
> +
> +#endif
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> new file mode 100644
> index 0000000000..a8fcf4f34d
> --- /dev/null
> +++ b/app/test-dma-perf/main.h
> @@ -0,0 +1,57 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#ifndef _MAIN_H_
> +#define _MAIN_H_
> +
> +
> +#include <rte_common.h>
> +#include <rte_cycles.h>
> +
> +#ifndef __maybe_unused
> +#define __maybe_unused	__rte_unused
> +#endif
> +
> +#define MAX_WORKER_NB 128
> +#define MAX_OUTPUT_STR_LEN 512
> +
> +#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
> +

While there are a number of RTE_LOG calls in the app, I also see a number
of regular printfs for output. Again, please standardize on one output - if
using just printf, you can drop this line.

> +extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> +
> +typedef enum {
> +	OP_NONE = 0,
> +	OP_ADD,
> +	OP_MUL
> +} alg_op_type;
> +
> +struct test_configure_entry {
> +	uint32_t first;
> +	uint32_t last;
> +	uint32_t incr;
> +	alg_op_type op;
> +	uint32_t cur;
> +};
> +
> +struct test_configure {
> +	uint8_t test_type;
> +	const char *test_type_str;
> +	uint16_t src_numa_node;
> +	uint16_t dst_numa_node;
> +	uint16_t opcode;
> +	bool is_dma;
> +	struct test_configure_entry mem_size;
> +	struct test_configure_entry buf_size;
> +	struct test_configure_entry ring_size;
> +	struct test_configure_entry kick_batch;
> +	uint32_t cache_flush;
> +	uint32_t nr_buf;
> +	uint32_t repeat_times;
> +	uint32_t nb_workers;
> +	uint16_t mpool_iter_step;
> +	const char *eal_args;
> +	uint8_t scenario_id;
> +};
> +
> +#endif /* _MAIN_H_ */
> diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
> new file mode 100644
> index 0000000000..001f67f6c1
> --- /dev/null
> +++ b/app/test-dma-perf/meson.build
> @@ -0,0 +1,20 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2019-2022 Intel Corporation

2023 now

> +
> +# meson file, for building this example as part of a main DPDK build.
> +#
> +# To build this example as a standalone application with an already-installed
> +# DPDK instance, use 'make'

Drop this comment. The test apps in "app" folder are for building as part
of DPDK only, there is no makefile to use.

> +
> +if is_windows
> +    build = false
> +    reason = 'not supported on Windows'
> +    subdir_done()
> +endif
> +
> +deps += ['dmadev', 'mbuf', 'cfgfile']
> +
> +sources = files(
> +        'main.c',
> +        'benchmark.c',
> +)
> --
> 2.35.1
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-17 12:05 ` [PATCH v3] " Cheng Jiang
  2023-01-17 15:44   ` Bruce Richardson
@ 2023-01-17 16:51   ` Bruce Richardson
  2023-01-28 13:32     ` Jiang, Cheng1
  2023-01-31  5:27     ` Hu, Jiayu
  1 sibling, 2 replies; 15+ messages in thread
From: Bruce Richardson @ 2023-01-17 16:51 UTC (permalink / raw)
  To: Cheng Jiang
  Cc: thomas, mb, dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang,
	xingguang.he

On Tue, Jan 17, 2023 at 12:05:26PM +0000, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---

More input based off trying running the application, including some
thoughts on the testing methodology below.


> +static void
> +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
> +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> +			float bandwidth, uint64_t ops, bool is_dma)
> +{
> +	if (is_dma)
> +		printf("lcore %u, DMA %u:\n"
> +				"average cycles: %" PRIu64 ","
> +				" buffer size: %u, nr_buf: %u,"
> +				" memory: %uMB, frequency: %" PRIu64 ".\n",
> +				lcore_id,
> +				dev_id,
> +				ave_cycle,
> +				buf_size,
> +				nr_buf,
> +				memory,
> +				rte_get_timer_hz());
> +	else
> +		printf("lcore %u\n"
> +			"average cycles: %" PRIu64 ","
> +			" buffer size: %u, nr_buf: %u,"
> +			" memory: %uMB, frequency: %" PRIu64 ".\n",
> +			lcore_id,
> +			ave_cycle,
> +			buf_size,
> +			nr_buf,
> +			memory,
> +			rte_get_timer_hz());
> +

The term "average cycles" is unclear here - is it average cycles per test
iteration, or average cycles per buffer copy?


> +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
> +

<snip>

> +
> +static inline void
> +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
> +			uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
> +{
> +	int64_t async_cnt = 0;
> +	int nr_cpl = 0;
> +	uint32_t index;
> +	uint16_t offset;
> +	uint32_t i;
> +
> +	for (offset = 0; offset < mpool_iter_step; offset++) {
> +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {
> +			if (unlikely(rte_dma_copy(dev_id,
> +						0,
> +						srcs[index]->buf_iova + srcs[index]->data_off,
> +						dsts[index]->buf_iova + dsts[index]->data_off,
> +						buf_size,
> +						0) < 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
> +					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
> +								NULL, NULL);
> +					async_cnt -= nr_cpl;
> +				}
> +				if (rte_dma_copy(dev_id,
> +						0,
> +						srcs[index]->buf_iova + srcs[index]->data_off,
> +						dsts[index]->buf_iova + dsts[index]->data_off,
> +						buf_size,
> +						0) < 0) {
> +					printf("enqueue fail again at %u\n", index);
> +					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
> +					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
> +				}
> +			}
> +			async_cnt++;
> +
> +			/**
> +			 * When '&' is used to wrap an index, mask must be a power of 2.
> +			 * That is, kick_batch must be 2^n.
> +			 */
> +			if (unlikely((async_cnt % kick_batch) == 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				/* add a poll to avoid ring full */
> +				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +				async_cnt -= nr_cpl;
> +			}
> +		}
> +
> +		rte_dma_submit(dev_id, 0);
> +		while (async_cnt > 0) {
> +			nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +			async_cnt -= nr_cpl;
> +		}

I have a couple of concerns about the methodology for testing the HW DMA
performance. For example, the inclusion of that final block means that we
are including the latency of the copy operation in the result.

If the objective of the test application is to determine if it is cheaper
for software to offload a copy operation to HW or do it in SW, then the
primary concern is the HW offload cost. That offload cost should remain
constant irrespective of the size of the copy - since all you are doing is
writing a descriptor and reading a completion result. However, seeing the
results of running the app, I notice that the reported average cycles
increases as the packet size increases, which would tend to indicate that
we are not giving a realistic measurement of offload cost.

The trouble then becomes how to do so in a more realistic manner. The most
accurate way I can think of in a unit test like this is to offload
<queue_size> entries to the device and measure the cycles taken there. Then
wait until such time as all copies are completed (to eliminate the latency
time, which in a real-world case would be spent by a core doing something
else), and then do a second measurement of the time taken to process all
the completions. In the same way as for a SW copy, any time not spent in
memcpy is not copy time, for HW copies any time spent not writing
descriptors or reading completions is not part of the offload cost.

That said, doing the above is still not fully realistic, as a real-world
app will likely still have some amount of other overhead, for example,
polling occasionally for completions in between doing other work (though
one would expect this to be relatively cheap).  Similarly, if the
submission queue fills, the app may have to delay waiting for space to
submit jobs, and therefore see some of the HW copy latency.

Therefore, I think the most realistic way to measure this is to look at the
rate of operations while processing is being done in the middle of the
test. For example, if we have a simple packet processing application,
running the application just doing RX and TX and measuring the rate allows
us to determine the basic packet I/O cost. Adding in an offload to HW for
each packet and again measuring the rate, will then allow us to compute the
true offload copy cost of the operation, and should give us a number that
remains flat even as packet size increases. For previous work done on vhost
with DMA acceleration, I believe we saw exactly that - while SW PPS reduced
as packet size increased, with HW copies the PPS remained constant even as
packet size increased.

The challenge to my mind, is therefore how to implement this in a suitable
unit-test style way, to fit into the framework you have given here. I would
suggest that the actual performance measurement needs to be done - not on a
total time - but on a fixed time basis within each test. For example, when
doing HW copies, 1ms into each test run, we need to snapshot the completed
entries, and then say 1ms later measure the number that have been completed
since. In this way, we avoid the initial startup latency while we wait for
jobs to start completing, and we avoid the final latency as we await the
last job to complete. We would also include time for some potentially empty
polls, and if a queue size is too small see that reflected in the
performance too.

Thoughts, input from others?

/Bruce

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH v2] app/dma-perf: introduce dma-perf application
  2023-01-17 14:03       ` Bruce Richardson
@ 2023-01-18  1:46         ` Jiang, Cheng1
  0 siblings, 0 replies; 15+ messages in thread
From: Jiang, Cheng1 @ 2023-01-18  1:46 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

Hi Bruce,

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Tuesday, January 17, 2023 10:04 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org; Hu,
> Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; He,
> Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v2] app/dma-perf: introduce dma-perf application
> 
> On Tue, Jan 17, 2023 at 01:54:50PM +0000, Jiang, Cheng1 wrote:
> > Hi Bruce,
> >
> > Thanks for your comments.
> > Replies are inline. I'll fix them in the next version.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Richardson, Bruce <bruce.richardson@intel.com>
> > > Sent: Tuesday, January 17, 2023 9:00 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org;
> Hu,
> > > Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma,
> > > WenwuX <wenwux.ma@intel.com>; Wang, YuanX <yuanx.wang@intel.com>;
> > > He, Xingguang <xingguang.he@intel.com>
> > > Subject: Re: [PATCH v2] app/dma-perf: introduce dma-perf application
> > >
> > > On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:
> 
> <snip>
> 
> > > > +#if !defined(RTE_EXEC_ENV_LINUX)
> > > > +
> > > > +int
> > > > +main(int argc, char *argv[])
> > > > +{
> > > > +	printf("OS not supported, skipping test\n");
> > > > +	return 0;
> > > > +}
> > > > +
> > >
> > > What is linux-specific about this app?
> > >
> > > If we do need to limit the app to Linux-only I suggest using meson
> > > to do so rather than putting #ifdefs in the code.
> >
> > Got it. Thanks!
> >
> Can you maybe clarify what is linux-specific about this app, or else perhaps test
> at least building on FreeBSD and windows. I'd rather not see stuff added just for
> Linux without some attempt to try on other OS's.
> 
> /Bruce

In fact, there is no special reason to only support Linux. I will remove this restriction. Thanks!

Cheng


^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-17 15:44   ` Bruce Richardson
@ 2023-01-19  7:18     ` Jiang, Cheng1
  0 siblings, 0 replies; 15+ messages in thread
From: Jiang, Cheng1 @ 2023-01-19  7:18 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

Hi Bruce,

Replies are inline.
Really appreciate your comments.
I'll fix it in the next version.

Thanks,
Cheng

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Tuesday, January 17, 2023 11:44 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org; Hu,
> Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; He,
> Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v3] app/dma-perf: introduce dma-perf application
> 
> On Tue, Jan 17, 2023 at 12:05:26PM +0000, Cheng Jiang wrote:
> > There are many high-performance DMA devices supported in DPDK now, and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre-set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> 
> Hi,
> 
> more review comments inline below.
> 
> /Bruce
> 
> > ---
> 
> <snip>
> 
> > eal_args=--legacy-mem --file-prefix=test
> 
> Why use legacy-mem mode? Rather than these options, just use "--in-memory"
> to avoid any conflicts. While this is only an example config, we should steer
> people away from legacy memory mode.

Ok, got it. I'll fix it.

> 
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c new
> > file mode 100644 index 0000000000..8041f5fdaf
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.c
> > @@ -0,0 +1,434 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation  */
> > +
> > +#include <stdio.h>
> > +#if !defined(RTE_EXEC_ENV_LINUX)
> > +
> > +int
> > +main(int argc, char *argv[])
> > +{
> > +	printf("OS not supported, skipping test\n");
> > +	return 0;
> > +}
> > +
> > +#else
> > +
> > +#include <stdlib.h>
> > +#include <getopt.h>
> > +#include <signal.h>
> > +#include <stdbool.h>
> > +#include <unistd.h>
> > +#include <sys/wait.h>
> > +#include <inttypes.h>
> > +
> > +#include <rte_eal.h>
> > +#include <rte_cfgfile.h>
> > +#include <rte_string_fns.h>
> > +#include <rte_lcore.h>
> > +
> > +#include "main.h"
> > +#include "benchmark.h"
> > +
> > +#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer
> size,nr_buf,memory(MB),cycle,bandwidth(Gbps),OPS\n"
> > +
> > +#define MAX_EAL_PARAM_NB 100
> > +#define MAX_EAL_PARAM_LEN 1024
> > +
> > +#define DMA_MEM_COPY "DMA_MEM_COPY"
> > +#define CPU_MEM_COPY "CPU_MEM_COPY"
> > +
> > +#define MAX_PARAMS_PER_ENTRY 4
> > +
> > +enum {
> > +	TEST_TYPE_NONE = 0,
> > +	TEST_TYPE_DMA_MEM_COPY,
> > +	TEST_TYPE_CPU_MEM_COPY
> > +};
> > +
> > +#define MAX_TEST_CASES 16
> > +static struct test_configure test_cases[MAX_TEST_CASES];
> > +
> > +char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > +
> > +static FILE *fd;
> > +
> > +static void
> > +output_csv(bool need_blankline)
> > +{
> > +	uint32_t i;
> > +
> > +	if (need_blankline) {
> > +		fprintf(fd, "%s", ",,,,,,,,\n");
> > +		fprintf(fd, "%s", ",,,,,,,,\n");
> you don't need the "%s" here. The string you are outputting is constant.

Sure, sorry about that.

> > +	}
> > +
> > +	for (i = 0; i < RTE_DIM(output_str); i++) {
> > +		if (output_str[i][0]) {
> > +			fprintf(fd, "%s", output_str[i]);
> > +			memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> 
> Rather than zeroing the whole string with memset, would "output_str[i][0] =
> '\0';" not work instead?

Good point. I'll try it in the next version. 

> 
> > +		}
> > +	}
> > +
> > +	fflush(fd);
> > +}
> > +
> > +static void
> > +output_env_info(void)
> > +{
> > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
> > +	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64
> > +"\n", rte_get_timer_hz());
> > +
> > +	output_csv(true);
> > +}
> > +
> > +static void
> > +output_header(uint32_t case_id, struct test_configure *case_cfg) {
> > +	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
> > +			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
> > +
> > +	output_csv(true);
> > +}
> > +
> > +static void
> > +run_test_case(struct test_configure *case_cfg) {
> > +	switch (case_cfg->test_type) {
> > +	case TEST_TYPE_DMA_MEM_COPY:
> > +		dma_mem_copy_benchmark(case_cfg);
> > +		break;
> > +	case TEST_TYPE_CPU_MEM_COPY:
> > +		cpu_mem_copy_benchmark(case_cfg);
> > +		break;
> > +	default:
> > +		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> > +		break;
> > +	}
> > +}
> > +
> > +static void
> > +run_test(uint32_t case_id, struct test_configure *case_cfg) {
> > +	uint32_t i;
> > +	uint32_t nb_lcores = rte_lcore_count();
> > +	struct test_configure_entry *mem_size = &case_cfg->mem_size;
> > +	struct test_configure_entry *buf_size = &case_cfg->buf_size;
> > +	struct test_configure_entry *ring_size = &case_cfg->ring_size;
> > +	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
> > +	struct test_configure_entry *var_entry = NULL;
> > +
> > +	for (i = 0; i < RTE_DIM(output_str); i++)
> > +		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
> > +
> > +	if (nb_lcores <= case_cfg->nb_workers) {
> > +		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
> > +			case_id, nb_lcores, case_cfg->nb_workers);
> > +		return;
> > +	}
> > +
> > +	RTE_LOG(INFO, DMA, "Number of used lcores: %u.\n", nb_lcores);
> > +
> > +	if (mem_size->incr != 0)
> > +		var_entry = mem_size;
> > +
> > +	if (buf_size->incr != 0)
> > +		var_entry = buf_size;
> > +
> > +	if (ring_size->incr != 0)
> > +		var_entry = ring_size;
> > +
> > +	if (kick_batch->incr != 0)
> > +		var_entry = kick_batch;
> > +
> > +	case_cfg->scenario_id = 0;
> > +
> > +	output_header(case_id, case_cfg);
> > +
> > +	if (var_entry) {
> 
> Things may be a bit simpler if instead of branching here, you initialize var_entry
> to a null var_entry i.e.
> 
> 	struct test_configure_entry dummy = { 0 };
> 	struct test_configure_entry *var_entry = &dummy;
> 
> This gives you a single-iteration loop in the case where there is nothing to vary.

I'll consider it, thanks!

> 
> > +		for (var_entry->cur = var_entry->first; var_entry->cur <=
> var_entry->last;) {
> > +			case_cfg->scenario_id++;
> > +			printf("\nRunning scenario %d\n", case_cfg-
> >scenario_id);
> > +
> > +			run_test_case(case_cfg);
> > +			output_csv(false);
> > +
> > +			if (var_entry->op == OP_MUL)
> > +				var_entry->cur *= var_entry->incr;
> > +			else
> > +				var_entry->cur += var_entry->incr;
> > +
> > +
> > +		}
> > +	} else {
> > +		run_test_case(case_cfg);
> > +		output_csv(false);
> > +	}
> > +}
> > +
> > +static int
> > +parse_entry(const char *value, struct test_configure_entry *entry) {
> > +	char input[255] = {0};
> > +	char *args[MAX_PARAMS_PER_ENTRY];
> > +	int args_nr = -1;
> > +
> > +	strncpy(input, value, 254);
> > +	if (*input == '\0')
> > +		goto out;
> > +
> > +	args_nr = rte_strsplit(input, strlen(input), args,
> MAX_PARAMS_PER_ENTRY, ',');
> > +	if (args_nr <= 0)
> > +		goto out;
> > +
> > +	entry->cur = entry->first = (uint32_t)atoi(args[0]);
> > +	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
> > +	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
> > +
> > +	if (args_nr > 3) {
> > +		if (!strcmp(args[3], "MUL"))
> > +			entry->op = OP_MUL;
> > +		else
> > +			entry->op = OP_ADD;
> 
> This means accepting invalid input. I think you should check the value against
> "ADD" so as to reject values like "SUB".

You are right, thanks! I'll fix it.

> 
> > +	} else
> > +		entry->op = OP_NONE;
> > +out:
> > +	return args_nr;
> > +}
> > +
> > +static void
> > +load_configs(void)
> > +{
> > +	struct rte_cfgfile *cfgfile;
> > +	int nb_sections, i;
> > +	struct test_configure *test_case;
> > +	char **sections_name;
> > +	const char *section_name, *case_type;
> > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> *kick_batch_str;
> > +	int args_nr, nb_vp;
> > +
> > +	sections_name = malloc(MAX_TEST_CASES * sizeof(char *));
> > +	for (i = 0; i < MAX_TEST_CASES; i++)
> > +		sections_name[i] = malloc(CFG_NAME_LEN * sizeof(char *));
> > +
> 
> I don't think you need to do this work, allocating space for a bunch of section
> names. From the example, it looks like the sections should be called "case1",
> "case2" etc., so you can just iterate through those sections, rather than allowing
> sections to have arbitrary names.

Good point, I'll try it in the next version.

> 
> > +	cfgfile = rte_cfgfile_load("./config.ini", 0);
> > +	if (!cfgfile) {
> > +		printf("Open configure file error.\n");
> > +		exit(1);
> > +	}
> 
> Don't hard-code the config file name. This should be taken from a commandline
> parameter, so that one can have collections of different test cases.

Yes, you are right, I'll fix it.

> 
> > +
> > +	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
> > +	if (nb_sections > MAX_TEST_CASES) {
> > +		printf("Error: The maximum number of cases is %d.\n",
> MAX_TEST_CASES);
> > +		exit(1);
> > +	}
> > +	rte_cfgfile_sections(cfgfile, sections_name, MAX_TEST_CASES);
> > +	for (i = 0; i < nb_sections; i++) {
> 
> Iterate through names here, built up dynamically to save memory space.

Sure.

> 
> > +		test_case = &test_cases[i];
> > +		section_name = sections_name[i];
> > +		case_type = rte_cfgfile_get_entry(cfgfile, section_name,
> "type");
> > +		if (!case_type) {
> > +			printf("Error: No case type in case %d\n.", i + 1);
> > +			exit(1);
> > +		}
> > +		if (!strcmp(case_type, DMA_MEM_COPY)) {
> 
> Coding standard for DPDK requires this to be "strcmp(...) == 0" rather than using
> "!" operator.

OK, got it.

> 
> > +			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
> > +			test_case->test_type_str = DMA_MEM_COPY;
> > +		} else if (!strcmp(case_type, CPU_MEM_COPY)) {
> > +			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
> > +			test_case->test_type_str = CPU_MEM_COPY;
> > +		} else {
> > +			printf("Error: Cannot find case type %s.\n", case_type);
> > +			exit(1);
> > +		}
> > +
> > +		nb_vp = 0;
> > +
> > +		test_case->src_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +								section_name,
> "src_numa_node"));
> > +		test_case->dst_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +								section_name,
> "dst_numa_node"));
> > +
> > +		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> "mem_size");
> > +		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
> > +		if (args_nr < 0) {
> > +			printf("parse error\n");
> > +			break;
> > +		} else if (args_nr > 1)
> > +			nb_vp++;
> > +
> > +		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> "buf_size");
> > +		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
> > +		if (args_nr < 0) {
> > +			printf("parse error\n");
> > +			break;
> > +		} else if (args_nr > 1)
> > +			nb_vp++;
> > +
> > +		ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
> "dma_ring_size");
> > +		args_nr = parse_entry(ring_size_str, &test_case->ring_size);
> > +		if (args_nr < 0) {
> > +			printf("parse error\n");
> > +			break;
> > +		} else if (args_nr > 1)
> > +			nb_vp++;
> > +
> > +		kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name,
> "kick_batch");
> > +		args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
> > +		if (args_nr < 0) {
> > +			printf("parse error\n");
> > +			break;
> > +		} else if (args_nr > 1)
> > +			nb_vp++;
> > +
> > +		if (nb_vp > 2) {
> > +			printf("%s, variable parameters can only have one.\n",
> > +section_name);
> 
> Reword to: "Error, each section can only have a single variable parameter"
> Also, comparison should be ">= 2" (or "> 1") rather than "> 2", which would
> allow 2 as a valid value.

Sure, thanks.

> 
> > +			break;
> > +		}
> > +
> > +		test_case->cache_flush =
> > +			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name,
> "cache_flush"));
> > +		test_case->repeat_times =
> > +			(uint32_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > +					section_name, "repeat_times"));
> > +		test_case->nb_workers =
> > +			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > +					section_name, "worker_threads"));
> > +		test_case->mpool_iter_step =
> > +			(uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
> > +					section_name, "mpool_iter_step"));
> > +
> > +		test_case->eal_args = rte_cfgfile_get_entry(cfgfile,
> section_name, "eal_args");
> > +	}
> > +
> > +	rte_cfgfile_close(cfgfile);
> > +	for (i = 0; i < MAX_TEST_CASES; i++) {
> > +		if (sections_name[i] != NULL)
> > +			free(sections_name[i]);
> 
> Two points here:
> 
> 1. You don't need to check for NULL before calling "free()". Free just does
> nothing if passed a null pointer
> 
> 2. None of these values should be NULL anyway, and you need to check the
> return from the malloc call. If you *do* keep the current way of reading sections
> (and I recommend you don't - see my comments above), you need to check that
> each malloc succeeds or else the call to "rte_cfgfile_sections"
> will try and do a strlcpy to a null pointer.

Sure, got it!

> 
> > +	}
> > +	free(sections_name);
> > +}
> > +
> > +/* Parse the argument given in the command line of the application */
> > +static int append_eal_args(int argc, char **argv, const char
> > +*eal_args, char **new_argv) {
> > +	int i;
> > +	char *tokens[MAX_EAL_PARAM_NB];
> > +	char args[MAX_EAL_PARAM_LEN] = {0};
> > +	int new_argc, token_nb;
> > +
> > +	new_argc = argc;
> > +
> > +	for (i = 0; i < argc; i++)
> > +		strcpy(new_argv[i], argv[i]);
> 
> I'm not sure we have a guarantee that new_argv will be big enough, do we?
> Better to use strlcpy just in case here.

I agree with you, we don't have a guarantee for that. I'll fix it.

> 
> > +
> > +	if (eal_args) {
> > +		strcpy(args, eal_args);
> 
> Use strlcpy for safety.

Sure.

> 
> > +		token_nb = rte_strsplit(args, strlen(args),
> > +					tokens, MAX_EAL_PARAM_NB, ' ');
> > +		for (i = 0; i < token_nb; i++)
> > +			strcpy(new_argv[new_argc++], tokens[i]);
> > +	}
> > +
> > +	return new_argc;
> > +}
> > +
> > +int
> > +main(int argc __maybe_unused, char *argv[] __maybe_unused) {
> > +	int ret;
> > +	uint32_t i, nb_lcores;
> > +	pid_t cpid, wpid;
> > +	int wstatus;
> > +	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
> > +	char *pargs[100];
> 
> char *pargs[MAX_EAL_PARAM_NB] ??

Sure, sorry about that.

> 
> > +	int new_argc;
> > +
> > +
> > +	memset(args, 0, sizeof(args));
> > +	for (i = 0; i < 100; i++)
> 
> RTE_DIM(pargs)

Sure.

> 
> > +		pargs[i] = args[i];
> > +
> > +	load_configs();
> > +	fd = fopen("./test_result.csv", "w");
> 
> Like the config file, the result output file should be configurable.
> Perhaps it should be based off the config file name?
> 
> 	test1.ini => test1_result.csv
> 	config.ini => config_result.csv

Good point, thanks!

> 
> > +	if (!fd) {
> > +		printf("Open output CSV file error.\n");
> > +		return 0;
> > +	}
> > +	fclose(fd);
> > +
> > +	/* loop each case, run it */
> > +	for (i = 0; i < MAX_TEST_CASES; i++) {
> > +		if (test_cases[i].test_type != TEST_TYPE_NONE) {
> 
> Flip this condition to reduce indentation:
> 
> 	if (test_cases[i].test_type == TEST_TYPE_NONE)
> 		continue;

Sure.

> 
> > +			cpid = fork();
> > +			if (cpid < 0) {
> > +				printf("Fork case %d failed.\n", i + 1);
> > +				exit(EXIT_FAILURE);
> > +			} else if (cpid == 0) {
> > +				printf("\nRunning case %u\n", i + 1);
> > +
> > +				if (test_cases[i].eal_args) {
> > +					new_argc = append_eal_args(argc,
> argv,
> > +						test_cases[i].eal_args, pargs);
> > +
> > +					ret = rte_eal_init(new_argc, pargs);
> > +				} else {
> 
> You don't need this if-else here. The append_eal_args function handles a NULL
> parameter, so unconditionally call append_eal_args and then eal_init(new_argc,
> pargs). We won't notice the different in init time, but the code would be clearer.

OK, got it.

> 
> > +					ret = rte_eal_init(argc, argv);
> > +				}
> > +				if (ret < 0)
> > +					rte_exit(EXIT_FAILURE, "Invalid EAL
> arguments\n");
> > +
> > +				/* Check lcores. */
> > +				nb_lcores = rte_lcore_count();
> > +				if (nb_lcores < 2)
> > +					rte_exit(EXIT_FAILURE,
> > +						"There should be at least 2
> worker lcores.\n");
> > +
> > +				fd = fopen("./test_result.csv", "a");
> > +				if (!fd) {
> > +					printf("Open output CSV file error.\n");
> > +					return 0;
> > +				}
> > +
> > +				if (i == 0)
> > +					output_env_info();
> 
> Beware that you have a condition above to skip any test cases where "test_type
> == TEST_TYPE_NONE". Therefore, if the first test is of type NONE, the env_info
> will never be printed.

I'll fix it in the next version.

> 
> > +				run_test(i + 1, &test_cases[i]);
> > +
> > +				/* clean up the EAL */
> > +				rte_eal_cleanup();
> > +
> > +				fclose(fd);
> > +
> > +				printf("\nCase %u completed.\n", i + 1);
> > +
> > +				exit(EXIT_SUCCESS);
> > +			} else {
> > +				wpid = waitpid(cpid, &wstatus, 0);
> > +				if (wpid == -1) {
> > +					printf("waitpid error.\n");
> > +					exit(EXIT_FAILURE);
> > +				}
> > +
> > +				if (WIFEXITED(wstatus))
> > +					printf("Case process exited.
> status %d\n",
> > +						WEXITSTATUS(wstatus));
> > +				else if (WIFSIGNALED(wstatus))
> > +					printf("Case process killed by
> signal %d\n",
> > +						WTERMSIG(wstatus));
> > +				else if (WIFSTOPPED(wstatus))
> > +					printf("Case process stopped by
> signal %d\n",
> > +						WSTOPSIG(wstatus));
> > +				else if (WIFCONTINUED(wstatus))
> > +					printf("Case process continued.\n");
> > +				else
> > +					printf("Case process unknown
> terminated.\n");
> > +			}
> > +		}
> > +	}
> > +
> > +	printf("Bye...\n");
> > +	return 0;
> > +}
> > +
> > +#endif
> > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h new
> > file mode 100644 index 0000000000..a8fcf4f34d
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.h
> > @@ -0,0 +1,57 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation  */
> > +
> > +#ifndef _MAIN_H_
> > +#define _MAIN_H_
> > +
> > +
> > +#include <rte_common.h>
> > +#include <rte_cycles.h>
> > +
> > +#ifndef __maybe_unused
> > +#define __maybe_unused	__rte_unused
> > +#endif
> > +
> > +#define MAX_WORKER_NB 128
> > +#define MAX_OUTPUT_STR_LEN 512
> > +
> > +#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
> > +
> 
> While there are a number of RTE_LOG calls in the app, I also see a number of
> regular printfs for output. Again, please standardize on one output - if using just
> printf, you can drop this line.

Sure, I'll fix it.

> 
> > +extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
> > +
> > +typedef enum {
> > +	OP_NONE = 0,
> > +	OP_ADD,
> > +	OP_MUL
> > +} alg_op_type;
> > +
> > +struct test_configure_entry {
> > +	uint32_t first;
> > +	uint32_t last;
> > +	uint32_t incr;
> > +	alg_op_type op;
> > +	uint32_t cur;
> > +};
> > +
> > +struct test_configure {
> > +	uint8_t test_type;
> > +	const char *test_type_str;
> > +	uint16_t src_numa_node;
> > +	uint16_t dst_numa_node;
> > +	uint16_t opcode;
> > +	bool is_dma;
> > +	struct test_configure_entry mem_size;
> > +	struct test_configure_entry buf_size;
> > +	struct test_configure_entry ring_size;
> > +	struct test_configure_entry kick_batch;
> > +	uint32_t cache_flush;
> > +	uint32_t nr_buf;
> > +	uint32_t repeat_times;
> > +	uint32_t nb_workers;
> > +	uint16_t mpool_iter_step;
> > +	const char *eal_args;
> > +	uint8_t scenario_id;
> > +};
> > +
> > +#endif /* _MAIN_H_ */
> > diff --git a/app/test-dma-perf/meson.build
> > b/app/test-dma-perf/meson.build new file mode 100644 index
> > 0000000000..001f67f6c1
> > --- /dev/null
> > +++ b/app/test-dma-perf/meson.build
> > @@ -0,0 +1,20 @@
> > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2022
> > +Intel Corporation
> 
> 2023 now

Got it.

> 
> > +
> > +# meson file, for building this example as part of a main DPDK build.
> > +#
> > +# To build this example as a standalone application with an
> > +already-installed # DPDK instance, use 'make'
> 
> Drop this comment. The test apps in "app" folder are for building as part of
> DPDK only, there is no makefile to use.

Sure.

> 
> > +
> > +if is_windows
> > +    build = false
> > +    reason = 'not supported on Windows'
> > +    subdir_done()
> > +endif
> > +
> > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > +
> > +sources = files(
> > +        'main.c',
> > +        'benchmark.c',
> > +)
> > --
> > 2.35.1
> >

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-17 16:51   ` Bruce Richardson
@ 2023-01-28 13:32     ` Jiang, Cheng1
  2023-01-30  9:20       ` Bruce Richardson
  2023-01-31  5:27     ` Hu, Jiayu
  1 sibling, 1 reply; 15+ messages in thread
From: Jiang, Cheng1 @ 2023-01-28 13:32 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

Hi Bruce,

Sorry for the late reply. We were on the Spring Festival holiday last week.
Thanks for your comments.
Replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Wednesday, January 18, 2023 12:52 AM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org; Hu,
> Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma, WenwuX
> <wenwux.ma@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; He,
> Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v3] app/dma-perf: introduce dma-perf application
> 
> On Tue, Jan 17, 2023 at 12:05:26PM +0000, Cheng Jiang wrote:
> > There are many high-performance DMA devices supported in DPDK now, and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre-set config file. Memory Copy performance test are
> supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> 
> More input based off trying running the application, including some thoughts on
> the testing methodology below.
> 
> 
> > +static void
> > +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id,
> uint64_t ave_cycle,
> > +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> > +			float bandwidth, uint64_t ops, bool is_dma) {
> > +	if (is_dma)
> > +		printf("lcore %u, DMA %u:\n"
> > +				"average cycles: %" PRIu64 ","
> > +				" buffer size: %u, nr_buf: %u,"
> > +				" memory: %uMB, frequency: %" PRIu64 ".\n",
> > +				lcore_id,
> > +				dev_id,
> > +				ave_cycle,
> > +				buf_size,
> > +				nr_buf,
> > +				memory,
> > +				rte_get_timer_hz());
> > +	else
> > +		printf("lcore %u\n"
> > +			"average cycles: %" PRIu64 ","
> > +			" buffer size: %u, nr_buf: %u,"
> > +			" memory: %uMB, frequency: %" PRIu64 ".\n",
> > +			lcore_id,
> > +			ave_cycle,
> > +			buf_size,
> > +			nr_buf,
> > +			memory,
> > +			rte_get_timer_hz());
> > +
> 
> The term "average cycles" is unclear here - is it average cycles per test iteration,
> or average cycles per buffer copy?

"Average cycles" here means the average cycles per buffer copy; I'll clarify it in the next version.

> 
> 
> > +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n",
> > +bandwidth, ops);
> > +
> 
> <snip>
> 
> > +
> > +static inline void
> > +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch,
> uint32_t buf_size,
> > +			uint16_t mpool_iter_step, struct rte_mbuf **srcs,
> struct rte_mbuf
> > +**dsts) {
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t index;
> > +	uint16_t offset;
> > +	uint32_t i;
> > +
> > +	for (offset = 0; offset < mpool_iter_step; offset++) {
> > +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf;
> i++) {
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova +
> srcs[index]->data_off,
> > +						dsts[index]->buf_iova +
> dsts[index]->data_off,
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) == 0)
> {
> > +					nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova +
> srcs[index]->data_off,
> > +						dsts[index]->buf_iova +
> dsts[index]->data_off,
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> index);
> > +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA enqueue
> failed\n");
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			/**
> > +			 * When '&' is used to wrap an index, mask must be a
> power of 2.
> > +			 * That is, kick_batch must be 2^n.
> > +			 */
> > +			if (unlikely((async_cnt % kick_batch) == 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +			}
> > +		}
> > +
> > +		rte_dma_submit(dev_id, 0);
> > +		while (async_cnt > 0) {
> > +			nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> > +			async_cnt -= nr_cpl;
> > +		}
> 
> I have a couple of concerns about the methodology for testing the HW DMA
> performance. For example, the inclusion of that final block means that we are
> including the latency of the copy operation in the result.
> 
> If the objective of the test application is to determine if it is cheaper for
> software to offload a copy operation to HW or do it in SW, then the primary
> concern is the HW offload cost. That offload cost should remain constant
> irrespective of the size of the copy - since all you are doing is writing a descriptor
> and reading a completion result. However, seeing the results of running the app,
> I notice that the reported average cycles increases as the packet size increases,
> which would tend to indicate that we are not giving a realistic measurement of
> offload cost.

We are trying to compare the time required to complete a certain amount of
work using DMA with the time required to complete it using CPU. I think in addition
to the offload cost, the capability of the DMA itself is also an important factor to be considered.
The offload cost should be constant, but when DMA copies memory of different lengths,
the time costs are different. So the reported average cycles increases as the packet size increases.
Therefore, this test result includes both offload cost and DMA operation cost. To some extent,
it should be a relative realistic measurement result.

Do you think it makes sense to you?

> 
> The trouble then becomes how to do so in a more realistic manner. The most
> accurate way I can think of in a unit test like this is to offload <queue_size>
> entries to the device and measure the cycles taken there. Then wait until such
> time as all copies are completed (to eliminate the latency time, which in a real-
> world case would be spent by a core doing something else), and then do a
> second measurement of the time taken to process all the completions. In the
> same way as for a SW copy, any time not spent in memcpy is not copy time, for
> HW copies any time spent not writing descriptors or reading completions is not
> part of the offload cost.

Agreed, we are thinking about adding offload cost as one of test results in the future.

> 
> That said, doing the above is still not fully realistic, as a real-world app will likely
> still have some amount of other overhead, for example, polling occasionally for
> completions in between doing other work (though one would expect this to be
> relatively cheap).  Similarly, if the submission queue fills, the app may have to
> delay waiting for space to submit jobs, and therefore see some of the HW copy
> latency.
> 
> Therefore, I think the most realistic way to measure this is to look at the rate of
> operations while processing is being done in the middle of the test. For example,
> if we have a simple packet processing application, running the application just
> doing RX and TX and measuring the rate allows us to determine the basic packet
> I/O cost. Adding in an offload to HW for each packet and again measuring the
> rate, will then allow us to compute the true offload copy cost of the operation,
> and should give us a number that remains flat even as packet size increases. For
> previous work done on vhost with DMA acceleration, I believe we saw exactly
> that - while SW PPS reduced as packet size increased, with HW copies the PPS
> remained constant even as packet size increased.
> 
> The challenge to my mind, is therefore how to implement this in a suitable unit-
> test style way, to fit into the framework you have given here. I would suggest
> that the actual performance measurement needs to be done - not on a total
> time - but on a fixed time basis within each test. For example, when doing HW
> copies, 1ms into each test run, we need to snapshot the completed entries, and
> then say 1ms later measure the number that have been completed since. In this
> way, we avoid the initial startup latency while we wait for jobs to start
> completing, and we avoid the final latency as we await the last job to complete.
> We would also include time for some potentially empty polls, and if a queue size
> is too small see that reflected in the performance too.

I understand your concerns, but I think maybe we are not discussing the same performance number here.
We are trying to test the maximum bandwidth of DMA, and what you said is how to measure the offload cost more accurately, if I understand it correctly.
I think these two performance data are both important. Maybe we can add your test methodology as one of performance aspect for
DMA in the future, I need to reconsider it and get back to you later.

Thanks a lot,
Cheng

> 
> Thoughts, input from others?
> 
> /Bruce

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-28 13:32     ` Jiang, Cheng1
@ 2023-01-30  9:20       ` Bruce Richardson
  2023-02-06 14:20         ` Jiang, Cheng1
  0 siblings, 1 reply; 15+ messages in thread
From: Bruce Richardson @ 2023-01-30  9:20 UTC (permalink / raw)
  To: Jiang, Cheng1
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

On Sat, Jan 28, 2023 at 01:32:05PM +0000, Jiang, Cheng1 wrote:
> Hi Bruce,
> 
> Sorry for the late reply. We are in the Spring Festival holiday last week.
> Thanks for your comments.
> Replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Richardson, Bruce <bruce.richardson@intel.com>
> > Sent: Wednesday, January 18, 2023 12:52 AM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org; Hu,
> > Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma, WenwuX
> > <wenwux.ma@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; He,
> > Xingguang <xingguang.he@intel.com>
> > Subject: Re: [PATCH v3] app/dma-perf: introduce dma-perf application
> > 
> > On Tue, Jan 17, 2023 at 12:05:26PM +0000, Cheng Jiang wrote:
> > > There are many high-performance DMA devices supported in DPDK now, and
> > > these DMA devices can also be integrated into other modules of DPDK as
> > > accelerators, such as Vhost. Before integrating DMA into applications,
> > > developers need to know the performance of these DMA devices in
> > > various scenarios and the performance of CPUs in the same scenario,
> > > such as different buffer lengths. Only in this way can we know the
> > > target performance of the application accelerated by using them. This
> > > patch introduces a high-performance testing tool, which supports
> > > comparing the performance of CPU and DMA in different scenarios
> > > automatically with a pre-set config file. Memory Copy performance test are
> > supported for now.
> > >
> > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> > 
> > More input based off trying running the application, including some thoughts on
> > the testing methodology below.
> > 
> > 
> > > +static void
> > > +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id,
> > uint64_t ave_cycle,
> > > +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> > > +			float bandwidth, uint64_t ops, bool is_dma) {
> > > +	if (is_dma)
> > > +		printf("lcore %u, DMA %u:\n"
> > > +				"average cycles: %" PRIu64 ","
> > > +				" buffer size: %u, nr_buf: %u,"
> > > +				" memory: %uMB, frequency: %" PRIu64 ".\n",
> > > +				lcore_id,
> > > +				dev_id,
> > > +				ave_cycle,
> > > +				buf_size,
> > > +				nr_buf,
> > > +				memory,
> > > +				rte_get_timer_hz());
> > > +	else
> > > +		printf("lcore %u\n"
> > > +			"average cycles: %" PRIu64 ","
> > > +			" buffer size: %u, nr_buf: %u,"
> > > +			" memory: %uMB, frequency: %" PRIu64 ".\n",
> > > +			lcore_id,
> > > +			ave_cycle,
> > > +			buf_size,
> > > +			nr_buf,
> > > +			memory,
> > > +			rte_get_timer_hz());
> > > +
> > 
> > The term "average cycles" is unclear here - is it average cycles per test iteration,
> > or average cycles per buffer copy?
> 
> The average cycles stands for average cycles per buffer copy, I'll clarify it in the next version.
> 
> > 
> > 
> > > +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n",
> > > +bandwidth, ops);
> > > +
> > 
> > <snip>
> > 
> > > +
> > > +static inline void
> > > +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch,
> > uint32_t buf_size,
> > > +			uint16_t mpool_iter_step, struct rte_mbuf **srcs,
> > struct rte_mbuf
> > > +**dsts) {
> > > +	int64_t async_cnt = 0;
> > > +	int nr_cpl = 0;
> > > +	uint32_t index;
> > > +	uint16_t offset;
> > > +	uint32_t i;
> > > +
> > > +	for (offset = 0; offset < mpool_iter_step; offset++) {
> > > +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf;
> > i++) {
> > > +			if (unlikely(rte_dma_copy(dev_id,
> > > +						0,
> > > +						srcs[index]->buf_iova +
> > srcs[index]->data_off,
> > > +						dsts[index]->buf_iova +
> > dsts[index]->data_off,
> > > +						buf_size,
> > > +						0) < 0)) {
> > > +				rte_dma_submit(dev_id, 0);
> > > +				while (rte_dma_burst_capacity(dev_id, 0) == 0)
> > {
> > > +					nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB,
> > > +								NULL, NULL);
> > > +					async_cnt -= nr_cpl;
> > > +				}
> > > +				if (rte_dma_copy(dev_id,
> > > +						0,
> > > +						srcs[index]->buf_iova +
> > srcs[index]->data_off,
> > > +						dsts[index]->buf_iova +
> > dsts[index]->data_off,
> > > +						buf_size,
> > > +						0) < 0) {
> > > +					printf("enqueue fail again at %u\n",
> > index);
> > > +					printf("space:%d\n",
> > rte_dma_burst_capacity(dev_id, 0));
> > > +					rte_exit(EXIT_FAILURE, "DMA enqueue
> > failed\n");
> > > +				}
> > > +			}
> > > +			async_cnt++;
> > > +
> > > +			/**
> > > +			 * When '&' is used to wrap an index, mask must be a
> > power of 2.
> > > +			 * That is, kick_batch must be 2^n.
> > > +			 */
> > > +			if (unlikely((async_cnt % kick_batch) == 0)) {
> > > +				rte_dma_submit(dev_id, 0);
> > > +				/* add a poll to avoid ring full */
> > > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > > +				async_cnt -= nr_cpl;
> > > +			}
> > > +		}
> > > +
> > > +		rte_dma_submit(dev_id, 0);
> > > +		while (async_cnt > 0) {
> > > +			nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > > +			async_cnt -= nr_cpl;
> > > +		}
> > 
> > I have a couple of concerns about the methodology for testing the HW DMA
> > performance. For example, the inclusion of that final block means that we are
> > including the latency of the copy operation in the result.
> > 
> > If the objective of the test application is to determine if it is cheaper for
> > software to offload a copy operation to HW or do it in SW, then the primary
> > concern is the HW offload cost. That offload cost should remain constant
> > irrespective of the size of the copy - since all you are doing is writing a descriptor
> > and reading a completion result. However, seeing the results of running the app,
> > I notice that the reported average cycles increases as the packet size increases,
> > which would tend to indicate that we are not giving a realistic measurement of
> > offload cost.
> 
> We are trying to compare the time required to complete a certain amount of
> work using DMA with the time required to complete it using CPU. I think in addition
> to the offload cost, the capability of the DMA itself is also an important factor to be considered.
> The offload cost should be constant , but when DMA copies memory of different lengths,
> the time costs are different. So the reported average cycles increases as the packet size increases.
> Therefore, this test result includes both offload cost and DMA operation cost. To some extent,
> it should be a relative realistic measurement result.
> 
> Do you think it makes sense to you?
> 

Hi,

Yes, I get your point about the job latency being different when the
packet/copy sizes increase, but on the other hand, as I state above the
actual cycle cost to the application should not increase. If any
application is doing what this test app is doing, just sitting around
waiting for job completion (in the fast path), then it is likely that the
programmer should look at improving the offload into the app.

The main issue here is that by outputting a single number, you are mixing
two separate values - both offload cost and job latency. If you want to
show the effects of larger/smaller packets on both, then you should output
both values separately. For most applications where you will offload copies
and do other work while the copy is being done, the offload cost is of
primary concern. For some applications the latency figure may also be
important, but in those cases the user will want to see the latency called
out explicitly, not just mixed up in a single figure with offload cost.

> > 
> > The trouble then becomes how to do so in a more realistic manner. The most
> > accurate way I can think of in a unit test like this is to offload <queue_size>
> > entries to the device and measure the cycles taken there. Then wait until such
> > time as all copies are completed (to eliminate the latency time, which in a real-
> > world case would be spent by a core doing something else), and then do a
> > second measurement of the time taken to process all the completions. In the
> > same way as for a SW copy, any time not spent in memcpy is not copy time, for
> > HW copies any time spent not writing descriptors or reading completions is not
> > part of the offload cost.
> 
> Agreed, we are thinking about adding offload cost as one of test results in the future.
> 
> > 
> > That said, doing the above is still not fully realistic, as a real-world app will likely
> > still have some amount of other overhead, for example, polling occasionally for
> > completions in between doing other work (though one would expect this to be
> > relatively cheap).  Similarly, if the submission queue fills, the app may have to
> > delay waiting for space to submit jobs, and therefore see some of the HW copy
> > latency.
> > 
> > Therefore, I think the most realistic way to measure this is to look at the rate of
> > operations while processing is being done in the middle of the test. For example,
> > if we have a simple packet processing application, running the application just
> > doing RX and TX and measuring the rate allows us to determine the basic packet
> > I/O cost. Adding in an offload to HW for each packet and again measuring the
> > rate, will then allow us to compute the true offload copy cost of the operation,
> > and should give us a number that remains flat even as packet size increases. For
> > previous work done on vhost with DMA acceleration, I believe we saw exactly
> > that - while SW PPS reduced as packet size increased, with HW copies the PPS
> > remained constant even as packet size increased.
> > 
> > The challenge to my mind, is therefore how to implement this in a suitable unit-
> > test style way, to fit into the framework you have given here. I would suggest
> > that the actual performance measurement needs to be done - not on a total
> > time - but on a fixed time basis within each test. For example, when doing HW
> > copies, 1ms into each test run, we need to snapshot the completed entries, and
> > then say 1ms later measure the number that have been completed since. In this
> > way, we avoid the initial startup latency while we wait for jobs to start
> > completing, and we avoid the final latency as we await the last job to complete.
> > We would also include time for some potentially empty polls, and if a queue size
> > is too small see that reflected in the performance too.
> 
> I understand your concerns, but I think maybe we are not discussing the same performance number here.
> We are trying to test the maximum bandwidth of DMA, and what you said is how to measure the offload cost more accurately if I understand it correctly.
> I think these two performance data are both important. Maybe we can add your test methodology as one of performance aspect for
> DMA in the future, I need to reconsider it and get back to you later.
> 

Max bandwidth of HW is a third and separate number from that of
offload-cost and latency. Again, it should be measured and reported separately
if you want the app to provide it.

Regards,

/Bruce


^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-17 16:51   ` Bruce Richardson
  2023-01-28 13:32     ` Jiang, Cheng1
@ 2023-01-31  5:27     ` Hu, Jiayu
  1 sibling, 0 replies; 15+ messages in thread
From: Hu, Jiayu @ 2023-01-31  5:27 UTC (permalink / raw)
  To: Richardson, Bruce, Jiang, Cheng1
  Cc: thomas, mb, dev, Ding, Xuan, Ma, WenwuX, Wang, YuanX, He, Xingguang

Hi Bruce,

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Wednesday, January 18, 2023 12:52 AM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org;
> Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma,
> WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v3] app/dma-perf: introduce dma-perf application
> 
+
> > +static inline void
> > +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t
> kick_batch, uint32_t buf_size,
> > +			uint16_t mpool_iter_step, struct rte_mbuf **srcs,
> struct rte_mbuf
> > +**dsts) {
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t index;
> > +	uint16_t offset;
> > +	uint32_t i;
> > +
> > +	for (offset = 0; offset < mpool_iter_step; offset++) {
> > +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf;
> i++) {
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova +
> srcs[index]->data_off,
> > +						dsts[index]->buf_iova +
> dsts[index]->data_off,
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0) {
> > +					nr_cpl = rte_dma_completed(dev_id,
> 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova +
> srcs[index]->data_off,
> > +						dsts[index]->buf_iova +
> dsts[index]->data_off,
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> index);
> > +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			/**
> > +			 * When '&' is used to wrap an index, mask must be a
> power of 2.
> > +			 * That is, kick_batch must be 2^n.
> > +			 */
> > +			if (unlikely((async_cnt % kick_batch) == 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +			}
> > +		}
> > +
> > +		rte_dma_submit(dev_id, 0);
> > +		while (async_cnt > 0) {
> > +			nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> > +			async_cnt -= nr_cpl;
> > +		}
> 
> I have a couple of concerns about the methodology for testing the HW DMA
> performance. For example, the inclusion of that final block means that we
> are including the latency of the copy operation in the result.

The waiting time will not exceed the time of completing <SW queue size> jobs.
We also introduce the initial startup time of waiting for the DMA to start completing jobs.
But if the total number of jobs is massive, we can ignore their impact. However,
the problem is that we cannot guarantee it in the current implementation, especially
when the memory footprint is small.

> 
> If the objective of the test application is to determine if it is cheaper for
> software to offload a copy operation to HW or do it in SW, then the primary
> concern is the HW offload cost. That offload cost should remain constant
> irrespective of the size of the copy - since all you are doing is writing a
> descriptor and reading a completion result. However, seeing the results of
> running the app, I notice that the reported average cycles increases as the
> packet size increases, which would tend to indicate that we are not giving a
> realistic measurement of offload cost.

I agree with the point that offload cost is very important. When using DMA
in an async manner, for N jobs, total DMA processing cycles, total offload cost
cycles, and total core higher-level processing cycles determine the final
performance. The total offload cost can be roughly calculated by the average
offload cost times the number of offloading operations. And as a benchmark tool,
we can provide the average offload cost (i.e., the total cycles of one submit, one kick
and one poll function call), which is irrespective of copy size. For example, for
1024 jobs and batch size 16, the total offload cost is 64 (1024/16) times the average
offload cost.

Repeat 64 times {
	submit 16 jobs; // always success
	kick;
	poll; // 16 jobs are done
}

As you pointed out, the above estimation method may become not accurate if
applications call poll function occasionally or wait for space when fail to submit.
But offload cost is still a very important value for applications, as it shows the
basic cost of using DPDK dmadev library and it also provides a rough estimation
for applications.

> 
> The trouble then becomes how to do so in a more realistic manner. The most
> accurate way I can think of in a unit test like this is to offload <queue_size>
> entries to the device and measure the cycles taken there. Then wait until
> such time as all copies are completed (to eliminate the latency time, which in
> a real-world case would be spent by a core doing something else), and then
> do a second measurement of the time taken to process all the completions.
> In the same way as for a SW copy, any time not spent in memcpy is not copy
> time, for HW copies any time spent not writing descriptors or reading
> completions is not part of the offload cost.
> 
> That said, doing the above is still not fully realistic, as a real-world app will
> likely still have some amount of other overhead, for example, polling
> occasionally for completions in between doing other work (though one
> would expect this to be relatively cheap).  Similarly, if the submission queue
> fills, the app may have to delay waiting for space to submit jobs, and
> therefore see some of the HW copy latency.
> 
> Therefore, I think the most realistic way to measure this is to look at the rate
> of operations while processing is being done in the middle of the test. For
> example, if we have a simple packet processing application, running the
> application just doing RX and TX and measuring the rate allows us to
> determine the basic packet I/O cost. Adding in an offload to HW for each
> packet and again measuring the rate, will then allow us to compute the true
> offload copy cost of the operation, and should give us a number that remains
> flat even as packet size increases. For previous work done on vhost with
> DMA acceleration, I believe we saw exactly that - while SW PPS reduced as
> packet size increased, with HW copies the PPS remained constant even as
> packet size increased.
> 
> The challenge to my mind, is therefore how to implement this in a suitable
> unit-test style way, to fit into the framework you have given here. I would
> suggest that the actual performance measurement needs to be done - not
> on a total time - but on a fixed time basis within each test. For example,
> when doing HW copies, 1ms into each test run, we need to snapshot the
> completed entries, and then say 1ms later measure the number that have
> been completed since. In this way, we avoid the initial startup latency while
> we wait for jobs to start completing, and we avoid the final latency as we
> await the last job to complete. We would also include time for some
> potentially empty polls, and if a queue size is too small see that reflected in
> the performance too.

The method you mentioned above is like how we measure NIC RX/TX PPS,
where the main thread is in charge of snapshotting the completed jobs for a fixed
time for all worker threads. But the trouble is when to finish one test, as the current
framework runs N test cases until all are completed. We may fix the testing time per
case, e.g. 1s, and in each test, the core repeatedly feeds N jobs to the DMA until running
out of time.

Lastly, I want to point out that the current DMA throughput is tested in an async
manner. The result will be different when using a sync manner. So the benchmark
tool needs to tell the user this in the final results.

Thanks,
Jiayu
> 
> Thoughts, input from others?
> 
> /Bruce

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH v3] app/dma-perf: introduce dma-perf application
  2023-01-30  9:20       ` Bruce Richardson
@ 2023-02-06 14:20         ` Jiang, Cheng1
  0 siblings, 0 replies; 15+ messages in thread
From: Jiang, Cheng1 @ 2023-02-06 14:20 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: thomas, mb, dev, Hu, Jiayu, Ding, Xuan, Ma, WenwuX, Wang, YuanX,
	He, Xingguang

Hi Bruce,

Replies are inline,

Thanks,
Cheng

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Monday, January 30, 2023 5:20 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Cc: thomas@monjalon.net; mb@smartsharesystems.com; dev@dpdk.org;
> Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma,
> WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>
> Subject: Re: [PATCH v3] app/dma-perf: introduce dma-perf application
> 
> On Sat, Jan 28, 2023 at 01:32:05PM +0000, Jiang, Cheng1 wrote:
> > Hi Bruce,
> >
> > Sorry for the late reply. We are in the Spring Festival holiday last week.
> > Thanks for your comments.
> > Replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Richardson, Bruce <bruce.richardson@intel.com>
> > > Sent: Wednesday, January 18, 2023 12:52 AM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>
> > > Cc: thomas@monjalon.net; mb@smartsharesystems.com;
> dev@dpdk.org; Hu,
> > > Jiayu <jiayu.hu@intel.com>; Ding, Xuan <xuan.ding@intel.com>; Ma,
> > > WenwuX <wenwux.ma@intel.com>; Wang, YuanX
> <yuanx.wang@intel.com>;
> > > He, Xingguang <xingguang.he@intel.com>
> > > Subject: Re: [PATCH v3] app/dma-perf: introduce dma-perf application
> > >
> > > On Tue, Jan 17, 2023 at 12:05:26PM +0000, Cheng Jiang wrote:
> > > > There are many high-performance DMA devices supported in DPDK
> now,
> > > > and these DMA devices can also be integrated into other modules of
> > > > DPDK as accelerators, such as Vhost. Before integrating DMA into
> > > > applications, developers need to know the performance of these DMA
> > > > devices in various scenarios and the performance of CPUs in the
> > > > same scenario, such as different buffer lengths. Only in this way
> > > > can we know the target performance of the application accelerated
> > > > by using them. This patch introduces a high-performance testing
> > > > tool, which supports comparing the performance of CPU and DMA in
> > > > different scenarios automatically with a pre-set config file.
> > > > Memory Copy performance test are
> > > supported for now.
> > > >
> > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > ---
> > >
> > > More input based off trying running the application, including some
> > > thoughts on the testing methodology below.
> > >
> > >
> > > > +static void
> > > > +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t
> > > > +dev_id,
> > > uint64_t ave_cycle,
> > > > +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> > > > +			float bandwidth, uint64_t ops, bool is_dma) {
> > > > +	if (is_dma)
> > > > +		printf("lcore %u, DMA %u:\n"
> > > > +				"average cycles: %" PRIu64 ","
> > > > +				" buffer size: %u, nr_buf: %u,"
> > > > +				" memory: %uMB, frequency: %" PRIu64
> ".\n",
> > > > +				lcore_id,
> > > > +				dev_id,
> > > > +				ave_cycle,
> > > > +				buf_size,
> > > > +				nr_buf,
> > > > +				memory,
> > > > +				rte_get_timer_hz());
> > > > +	else
> > > > +		printf("lcore %u\n"
> > > > +			"average cycles: %" PRIu64 ","
> > > > +			" buffer size: %u, nr_buf: %u,"
> > > > +			" memory: %uMB, frequency: %" PRIu64 ".\n",
> > > > +			lcore_id,
> > > > +			ave_cycle,
> > > > +			buf_size,
> > > > +			nr_buf,
> > > > +			memory,
> > > > +			rte_get_timer_hz());
> > > > +
> > >
> > > The term "average cycles" is unclear here - is it average cycles per
> > > test iteration, or average cycles per buffer copy?
> >
> > The average cycles stands for average cycles per buffer copy, I'll clarify it in
> the next version.
> >
> > >
> > >
> > > > +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n",
> > > > +bandwidth, ops);
> > > > +
> > >
> > > <snip>
> > >
> > > > +
> > > > +static inline void
> > > > +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t
> > > > +kick_batch,
> > > uint32_t buf_size,
> > > > +			uint16_t mpool_iter_step, struct rte_mbuf **srcs,
> > > struct rte_mbuf
> > > > +**dsts) {
> > > > +	int64_t async_cnt = 0;
> > > > +	int nr_cpl = 0;
> > > > +	uint32_t index;
> > > > +	uint16_t offset;
> > > > +	uint32_t i;
> > > > +
> > > > +	for (offset = 0; offset < mpool_iter_step; offset++) {
> > > > +		for (i = 0; index = i * mpool_iter_step + offset, index <
> > > > +nr_buf;
> > > i++) {
> > > > +			if (unlikely(rte_dma_copy(dev_id,
> > > > +						0,
> > > > +						srcs[index]->buf_iova +
> > > srcs[index]->data_off,
> > > > +						dsts[index]->buf_iova +
> > > dsts[index]->data_off,
> > > > +						buf_size,
> > > > +						0) < 0)) {
> > > > +				rte_dma_submit(dev_id, 0);
> > > > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0)
> > > {
> > > > +					nr_cpl = rte_dma_completed(dev_id,
> 0,
> > > MAX_DMA_CPL_NB,
> > > > +								NULL, NULL);
> > > > +					async_cnt -= nr_cpl;
> > > > +				}
> > > > +				if (rte_dma_copy(dev_id,
> > > > +						0,
> > > > +						srcs[index]->buf_iova +
> > > srcs[index]->data_off,
> > > > +						dsts[index]->buf_iova +
> > > dsts[index]->data_off,
> > > > +						buf_size,
> > > > +						0) < 0) {
> > > > +					printf("enqueue fail again at %u\n",
> > > index);
> > > > +					printf("space:%d\n",
> > > rte_dma_burst_capacity(dev_id, 0));
> > > > +					rte_exit(EXIT_FAILURE, "DMA
> enqueue
> > > failed\n");
> > > > +				}
> > > > +			}
> > > > +			async_cnt++;
> > > > +
> > > > +			/**
> > > > +			 * When '&' is used to wrap an index, mask must be a
> > > power of 2.
> > > > +			 * That is, kick_batch must be 2^n.
> > > > +			 */
> > > > +			if (unlikely((async_cnt % kick_batch) == 0)) {
> > > > +				rte_dma_submit(dev_id, 0);
> > > > +				/* add a poll to avoid ring full */
> > > > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +				async_cnt -= nr_cpl;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		rte_dma_submit(dev_id, 0);
> > > > +		while (async_cnt > 0) {
> > > > +			nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +			async_cnt -= nr_cpl;
> > > > +		}
> > >
> > > I have a couple of concerns about the methodology for testing the HW
> > > DMA performance. For example, the inclusion of that final block
> > > means that we are including the latency of the copy operation in the
> result.
> > >
> > > If the objective of the test application is to determine if it is
> > > cheaper for software to offload a copy operation to HW or do it in
> > > SW, then the primary concern is the HW offload cost. That offload
> > > cost should remain constant irrespective of the size of the copy -
> > > since all you are doing is writing a descriptor and reading a
> > > completion result. However, seeing the results of running the app, I
> > > notice that the reported average cycles increases as the packet size
> > > increases, which would tend to indicate that we are not giving a realistic
> measurement of offload cost.
> >
> > We are trying to compare the time required to complete a certain
> > amount of work using DMA with the time required to complete it using
> > CPU. I think in addition to the offload cost, the capability of the DMA itself
> is also an important factor to be considered.
> > The offload cost should be constant , but when DMA copies memory of
> > different lengths, the time costs are different. So the reported average
> cycles increases as the packet size increases.
> > Therefore, this test result includes both offload cost and DMA
> > operation cost. To some extent, it should be a relative realistic
> measurement result.
> >
> > Do you think it makes sense to you?
> >
> 
> Hi,
> 
> Yes, I get your point about the job latency being different when the
> packet/copy sizes increase, but on the other hand, as I state above the actual
> cycle cost to the application should not increase. If any application is doing
> what this test app is doing, just sitting around waiting for job completion (in
> the fast path), then it is likely that the programmer should look at improving
> the offload into the app.
> 
> The main issue here is that by outputting a single number, you are mixing
> two separate values - both offload cost and job latency. If you want to show
> the effects of larger/smaller packets on both, then you should output both
> values separately. For most applications where you will offload copies and do
> other work while the copy is being done, the offload cost is of primary
> concern. For some applications the latency figure may also be important, but
> in those cases the user will want to see the latency called out explicitly, not
> just mixed up in a single figure with offload cost.

Sure, makes sense to me, thanks.

> 
> > >
> > > The trouble then becomes how to do so in a more realistic manner.
> > > The most accurate way I can think of in a unit test like this is to
> > > offload <queue_size> entries to the device and measure the cycles
> > > taken there. Then wait until such time as all copies are completed
> > > (to eliminate the latency time, which in a real- world case would be
> > > spent by a core doing something else), and then do a second
> > > measurement of the time taken to process all the completions. In the
> > > same way as for a SW copy, any time not spent in memcpy is not copy
> > > time, for HW copies any time spent not writing descriptors or reading
> completions is not part of the offload cost.
> >
> > Agreed, we are thinking about adding offload cost as one of test results in
> the future.
> >
> > >
> > > That said, doing the above is still not fully realistic, as a
> > > real-world app will likely still have some amount of other overhead,
> > > for example, polling occasionally for completions in between doing
> > > other work (though one would expect this to be relatively cheap).
> > > Similarly, if the submission queue fills, the app may have to delay
> > > waiting for space to submit jobs, and therefore see some of the HW copy
> latency.
> > >
> > > Therefore, I think the most realistic way to measure this is to look
> > > at the rate of operations while processing is being done in the
> > > middle of the test. For example, if we have a simple packet
> > > processing application, running the application just doing RX and TX
> > > and measuring the rate allows us to determine the basic packet I/O
> > > cost. Adding in an offload to HW for each packet and again measuring
> > > the rate, will then allow us to compute the true offload copy cost
> > > of the operation, and should give us a number that remains flat even
> > > as packet size increases. For previous work done on vhost with DMA
> > > acceleration, I believe we saw exactly that - while SW PPS reduced as
> packet size increased, with HW copies the PPS remained constant even as
> packet size increased.
> > >
> > > The challenge to my mind, is therefore how to implement this in a
> > > suitable unit- test style way, to fit into the framework you have
> > > given here. I would suggest that the actual performance measurement
> > > needs to be done - not on a total time - but on a fixed time basis
> > > within each test. For example, when doing HW copies, 1ms into each
> > > test run, we need to snapshot the completed entries, and then say
> > > 1ms later measure the number that have been completed since. In this
> > > way, we avoid the initial startup latency while we wait for jobs to start
> completing, and we avoid the final latency as we await the last job to
> complete.
> > > We would also include time for some potentially empty polls, and if
> > > a queue size is too small see that reflected in the performance too.
> >
> > I understand your concerns, but I think maybe we are not discussing the
> same performance number here.
> > We are trying to test the maximum bandwidth of DMA, and what you said
> is how to measure the offload cost more accurately if I understand it
> correctly.
> > I think these two performance data are both important. Maybe we can
> > add your test methodology as one of performance aspect for DMA in the
> future, I need to reconsider it and get back to you later.
> >
> 
> Max bandwidth of HW is a third and separate number from that of offload-
> cost and latency. Again, it should be measured and reported separately if you
> want the app to provide it.

OK, got it. We will try to implement such test method in the next version.

Thanks.
Cheng

> 
> Regards,
> 
> /Bruce


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] app/dma-perf: introduce dma-perf application
@ 2023-04-20  7:22 Cheng Jiang
  0 siblings, 0 replies; 15+ messages in thread
From: Cheng Jiang @ 2023-04-20  7:22 UTC (permalink / raw)
  To: thomas, bruce.richardson, mb
  Cc: dev, jiayu.hu, xuan.ding, wenwux.ma, yuanx.wang, xingguang.he,
	Cheng Jiang

There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory copy performance tests are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 467 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  56 ++++
 app/test-dma-perf/main.c      | 445 ++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  56 ++++
 app/test-dma-perf/meson.build |  17 ++
 6 files changed, 1042 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..514cb2f7b2 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@ apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..36e3413bdc
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,467 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%.3lf\n"
+
+struct worker_info {
+	bool ready_flag;
+	bool start_flag;
+	bool stop_flag;
+	uint32_t total_cpl;
+	uint32_t test_cpl;
+};
+
+struct lcore_params {
+	uint8_t scenario_id;
+	unsigned int lcore_id;
+	uint16_t worker_id;
+	uint16_t dev_id;
+	uint32_t nr_buf;
+	uint16_t kick_batch;
+	uint32_t buf_size;
+	uint16_t test_secs;
+	struct rte_mbuf **srcs;
+	struct rte_mbuf **dsts;
+	struct worker_info worker_info;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+uint16_t dmadev_ids[MAX_WORKER_NB];
+uint32_t nb_dmadevs;
+
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/* fprintf-style helper behind PRINT_ERR: prefixes the message with the
+ * calling function name and line number, then forwards the format string
+ * and arguments to stderr. Returns the total number of characters written.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+/* Derive per-worker result metrics from a completed-operation count.
+ * memory:    src + dst buffer footprint of one worker, in MiB.
+ * ave_cycle: average timer cycles per completed copy over test_secs.
+ * bandwidth: Gbps moved, from buf_size (bytes) and copies per second.
+ * mops:      millions of completed operations per second.
+ * NOTE(review): total_cnt == 0 would divide by zero here - callers must
+ * guarantee at least one completion was recorded.
+ */
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, uint32_t *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	*memory = (buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
+	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000;
+}
+
+/* Print one worker's results to stdout and stage the matching CSV line in
+ * output_str[lcore_id] (flushed later by output_csv() in main.c). The DMA
+ * and CPU cases use different CSV formats: the CPU line has no device id.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %u:\n", lcore_id, dev_id);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %uMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dev_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+/* Flush the data of nr_buf mbufs out of the CPU caches, one 64-byte cache
+ * line at a time, so the benchmark starts from cold caches. Only effective
+ * on x86-64 (clflush); a no-op elsewhere, hence the __maybe_unused params.
+ * NOTE(review): parameter order is (array, count, size) - verify call
+ * sites pass the arguments in this order.
+ */
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t nr_buf,
+		__maybe_unused uint32_t buf_size)
+{
+#ifdef RTE_ARCH_X86_64
+	char *data;
+	struct rte_mbuf **srcs = array;
+	uint32_t i, offset;
+
+	for (i = 0; i < nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (offset = 0; offset < buf_size; offset += 64)
+			__builtin_ia32_clflush(data + offset);
+	}
+#endif
+}
+
+/* Configure and start one dmadev: a single mem-to-mem vchan with ring_size
+ * descriptors. Any failure is fatal (rte_exit), since the benchmark cannot
+ * run without all requested devices.
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration\n");
+
+	/* Read back the device info to confirm the vchan really got set up. */
+	rte_dma_info_get(dev_id, &info);
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+
+/* Walk the available dmadevs and configure one per worker, recording the
+ * ids in the global dmadev_ids[] and the count in nb_dmadevs.
+ * Returns 0 on success, -1 if fewer devices exist than workers requested.
+ */
+static int
+config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
+{
+	int16_t dev_id = rte_dma_next_dev(0);
+	uint32_t i;
+
+	nb_dmadevs = 0;
+
+	for (i = 0; i < nb_workers; i++) {
+		/* -1 means no more dmadevs are available. */
+		if (dev_id == -1)
+			goto end;
+
+		dmadev_ids[i] = dev_id;
+		configure_dmadev_queue(dmadev_ids[i], ring_size);
+		++nb_dmadevs;
+		dev_id = rte_dma_next_dev(dev_id + 1);
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+/* DMA worker loop, launched on a remote lcore. Repeatedly enqueues copies
+ * of all nr_buf buffers on its dmadev, submitting every kick_batch entries
+ * and polling completions into worker_info->total_cpl, until the control
+ * thread raises stop_flag. Synchronisation with the control thread is via
+ * the volatile ready/start/stop flags in worker_info.
+ * NOTE(review): p must point at a stable uint16_t worker index that stays
+ * valid until this worker has read it.
+ */
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait until the control thread starts the measurement. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				/* Ring full: submit what is queued and drain
+				 * completions until space frees up, then retry
+				 * the enqueue once.
+				 */
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	/* Drain outstanding copies after the measurement window; these are
+	 * deliberately not added to total_cpl (bounded by POLL_MAX polls).
+	 */
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+/* CPU worker loop, launched on a remote lcore: the software baseline for
+ * the DMA test. Copies all nr_buf buffers with rte_memcpy in a loop until
+ * stop_flag is raised, counting each copy in worker_info->total_cpl.
+ * NOTE(review): the IOVA returned by rte_mbuf_data_iova() is cast to a
+ * virtual address - this presumably relies on IOVA-as-VA mode; confirm.
+ */
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	/* Busy-wait until the control thread starts the measurement. */
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/* copy buffer from src to dst */
+			rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
+				(void *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+/* Allocate the source/destination mempools and the mbuf pointer arrays for
+ * one scenario, honouring the configured NUMA nodes. On success *srcs and
+ * *dsts hold cfg->nr_buf mbufs each. Returns 0 on success, -1 on failure;
+ * on failure the caller's cleanup path frees whatever was allocated.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/* Zero the pointer arrays (rte_zmalloc) so that the caller's error
+	 * path can safely rte_pktmbuf_free_bulk() them even if we fail below
+	 * before rte_mempool_get_bulk() populated them.
+	 */
+	*srcs = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Run one benchmark scenario: set up memory (and dmadevs when is_dma),
+ * launch one worker per configured thread, let them copy for test_secs
+ * seconds, then collect completion counts, report results and tear down.
+ */
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id  = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = cfg->nb_workers;
+	uint16_t test_secs = cfg->test_secs;
+	uint32_t memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		/* BUGFIX: arguments were passed as (buf, buf_size, nr_buf),
+		 * swapping count and size versus the function's
+		 * (array, nr_buf, buf_size) signature.
+		 */
+		cache_flush_buf(srcs, nr_buf, buf_size);
+		cache_flush_buf(dsts, nr_buf, buf_size);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = rte_get_next_lcore(lcore_id, true, true);
+		offset = nr_buf / nb_workers * i;
+
+		/* Zero-allocate so the worker_info flags and counters start
+		 * in a known state before the worker is launched.
+		 */
+		worker_params[i] = rte_zmalloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		if (is_dma) {
+			worker_params[i]->dev_id = dmadev_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		/* BUGFIX: launch with a pointer to the stable per-worker id
+		 * stored in worker_params[], not &i - the loop variable keeps
+		 * changing (and ends out of range) before the remote lcore
+		 * dereferences it.
+		 */
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy,
+				(void *)(uint16_t *)&worker_params[i]->worker_id, lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy,
+				(void *)(uint16_t *)&worker_params[i]->worker_id, lcore_id);
+	}
+
+	/* Wait for every worker to signal it is spinning on start_flag. */
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (worker_params[i]->worker_info.ready_flag == false) {
+				ready = 0;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	/* Warm-up window, then snapshot counters and measure for test_secs. */
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl;
+
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl -
+						worker_params[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dev_id, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+
+	if (is_dma) {
+		for (i = 0; i < nb_dmadevs; i++) {
+			printf("Stopping dmadev %d\n", dmadev_ids[i]);
+			rte_dma_stop(dmadev_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..47bcdd2eb2
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,56 @@
+
+; Supported test types:
+; DMA_MEM_COPY|CPU_MEM_COPY
+
+; Parameters:
+; "mem_size","buf_size","dma_ring_size","kick_batch".
+; "mem_size" means the size of the memory footprint.
+; "buf_size" means the memory size of a single operation.
+; "dma_ring_size" means the dma ring buffer size.
+; "kick_batch" means dma operation batch size.
+
+; Format: variable=first[,last,increment[,ADD|MUL]]
+; ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to control if the cache should be flushed.
+
+; test_seconds is used to control the test time of the whole case.
+
+; worker_threads is used to control the threads number of the test app.
+; It should be less than the core number.
+
+; mpool_iter_step is used to control the buffer continuity.
+
+; Bind DMA to lcore:
+; Specify the "lcore_dma" parameter.
+; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
+; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
+; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
+; their range.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+worker_threads=1
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+worker_threads=1
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..00176db6d6
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,445 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/* Flush every staged per-lcore CSV line in output_str[] to the result file
+ * and clear each one afterwards. When need_blankline is set, two empty CSV
+ * rows are written first to separate sections visually.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t i;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (i = 0; i < RTE_DIM(output_str); i++) {
+		if (output_str[i][0]) {
+			fprintf(fd, "%s", output_str[i]);
+			/* Mark the slot empty so it is not re-emitted. */
+			output_str[i][0] = '\0';
+		}
+	}
+
+	fflush(fd);
+}
+
+/* Write the test-environment header (timer frequency) to the CSV file. */
+static void
+output_env_info(void)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+/* Write the per-case CSV column header for the given case id and type. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
+			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+/* Dispatch one parsed test case to the matching benchmark routine. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	if (case_cfg->test_type == TEST_TYPE_DMA_MEM_COPY)
+		mem_copy_benchmark(case_cfg, true);
+	else if (case_cfg->test_type == TEST_TYPE_CPU_MEM_COPY)
+		mem_copy_benchmark(case_cfg, false);
+	else
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/* Run all scenarios of one test case. Exactly one of the four range
+ * entries (mem_size, buf_size, ring_size, kick_batch) may be variable
+ * (incr != 0); that entry is swept from first to last by ADD or MUL steps,
+ * re-running the benchmark for each value. With no variable entry, the
+ * dummy (incr == 0, op == OP_NONE) causes a single iteration.
+ * NOTE(review): if several entries have incr != 0, only the last one in
+ * the chain below is swept - load_configs() is expected to reject that.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* One lcore is the main/control thread, so strictly more lcores than
+	 * workers are required.
+	 */
+	if (nb_lcores <= case_cfg->nb_workers) {
+		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
+			case_id, nb_lcores, case_cfg->nb_workers);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			var_entry->cur *= var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			var_entry->cur += var_entry->incr;
+		else
+			break;
+	}
+
+}
+
+/* Parse a config value of the form "first[,last,incr[,ADD|MUL]]" into a
+ * test_configure_entry. entry->cur is initialised to first; missing fields
+ * default to 0 and the op defaults to OP_NONE (single fixed value).
+ * Returns the number of comma-separated fields parsed, or -1 on error
+ * (empty/unsplittable value or an op other than ADD/MUL).
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr = -1;
+
+	strncpy(input, value, 254);
+	if (*input == '\0')
+		goto out;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s", args[3]);
+			args_nr = -1;
+		}
+	} else
+		entry->op = OP_NONE;
+out:
+	return args_nr;
+}
+
+/* Load every [caseN] section of the INI file at path into test_cases[].
+ * Sections must be named case1..caseN consecutively; each must carry a
+ * "type" of DMA_MEM_COPY or CPU_MEM_COPY. At most one range entry per
+ * section may be variable (nb_vp counts entries with more than one field).
+ * Exits the process on unopenable file, too many sections, or bad type.
+ * NOTE(review): rte_cfgfile_get_entry() returns NULL for a missing key;
+ * several results below go straight into atoi()/parse_entry() unchecked,
+ * so an incomplete section would crash - confirm keys are mandatory.
+ */
+static void
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+	bool is_dma;
+
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d\n.", i + 1);
+			exit(1);
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Cannot find case type %s.\n", case_type);
+			exit(1);
+		}
+
+		/* nb_vp counts how many entries are ranges (variable). */
+		nb_vp = 0;
+
+		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "src_numa_node"));
+		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dst_numa_node"));
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error\n");
+			break;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		/* ring size and kick batch only apply to DMA cases. */
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error\n");
+				break;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error\n");
+				break;
+			} else if (args_nr > 1)
+				nb_vp++;
+		}
+
+		if (nb_vp > 1) {
+			printf("Error, each section can only have a single variable parameter.\n");
+			break;
+		}
+
+		test_case->cache_flush =
+			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name, "cache_flush"));
+		test_case->test_secs = (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "test_seconds"));
+		test_case->nb_workers = (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "worker_threads"));
+
+		test_case->eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+	}
+
+	rte_cfgfile_close(cfgfile);
+}
+
+/* Build the EAL argv for a test case: copy the application's own argv,
+ * skipping the --config/--result options and their values, then append
+ * the whitespace-split tokens of the per-case "eal_args" string. Each
+ * new_argv[] slot is a caller-provided buffer of MAX_EAL_PARAM_LEN bytes.
+ * Returns the new argument count.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			i++;	/* skip the option's value argument */
+			continue;
+		}
+		/* BUGFIX: sizeof(new_argv[new_argc]) is the size of a char *
+		 * (truncating every copied argument to pointer-size - 1
+		 * characters); use the actual buffer length instead.
+		 */
+		strlcpy(new_argv[new_argc], argv[i], MAX_EAL_PARAM_LEN);
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		/* Bounded copy instead of strcpy, for consistency above. */
+		for (i = 0; i < token_nb; i++)
+			strlcpy(new_argv[new_argc++], tokens[i], MAX_EAL_PARAM_LEN);
+	}
+
+	return new_argc;
+}
+
+/* Entry point. Parses --config/--result, loads all test cases from the
+ * config file, then runs each case in a forked child so every case can
+ * call rte_eal_init() with its own "eal_args" and exit cleanly.
+ */
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		/* No --result given: derive "<config basename>_result.csv".
+		 * NOTE(review): strcat into rst_path[PATH_MAX] is unbounded -
+		 * a near-PATH_MAX config path could overflow; confirm inputs.
+		 */
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		strcat(strtok(basename(rst_path), "."), "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	load_configs(cfg_path_ptr);
+	/* Truncate the result file; children re-open it in append mode. */
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE)
+			continue;
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+
+			if (is_first_case) {
+				output_env_info();
+				is_first_case = false;
+			}
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n");
+			else
+				printf("Case process unknown terminated.\n");
+
+			/* BUGFIX: the child's is_first_case = false dies with
+			 * the child process; clear it in the parent too, or
+			 * every case re-prints the environment info header.
+			 */
+			is_first_case = false;
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..235cb74daf
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+typedef enum {
+	OP_NONE = 0,
+	OP_ADD,
+	OP_MUL
+} alg_op_type;
+
+struct test_configure_entry {
+	uint32_t first;
+	uint32_t last;
+	uint32_t incr;
+	alg_op_type op;
+	uint32_t cur;
+};
+
+struct test_configure {
+	uint8_t test_type;
+	const char *test_type_str;
+	uint16_t src_numa_node;
+	uint16_t dst_numa_node;
+	uint16_t opcode;
+	bool is_dma;
+	struct test_configure_entry mem_size;
+	struct test_configure_entry buf_size;
+	struct test_configure_entry ring_size;
+	struct test_configure_entry kick_batch;
+	uint32_t cache_flush;
+	uint32_t nr_buf;
+	uint16_t test_secs;
+	uint32_t nb_workers;
+	const char *eal_args;
+	uint8_t scenario_id;
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)
-- 
2.35.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-04-20  7:57 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-20  1:06 [PATCH] app/dma-perf: introduce dma-perf application Cheng Jiang
2023-01-17  1:56 ` [PATCH v2] " Cheng Jiang
2023-01-17 13:00   ` Bruce Richardson
2023-01-17 13:54     ` Jiang, Cheng1
2023-01-17 14:03       ` Bruce Richardson
2023-01-18  1:46         ` Jiang, Cheng1
2023-01-17 12:05 ` [PATCH v3] " Cheng Jiang
2023-01-17 15:44   ` Bruce Richardson
2023-01-19  7:18     ` Jiang, Cheng1
2023-01-17 16:51   ` Bruce Richardson
2023-01-28 13:32     ` Jiang, Cheng1
2023-01-30  9:20       ` Bruce Richardson
2023-02-06 14:20         ` Jiang, Cheng1
2023-01-31  5:27     ` Hu, Jiayu
2023-04-20  7:22 [PATCH] " Cheng Jiang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).