From: Cheng Jiang <cheng1.jiang@intel.com>
To: thomas@monjalon.net, bruce.richardson@intel.com,
mb@smartsharesystems.com
Cc: dev@dpdk.org, jiayu.hu@intel.com, xuan.ding@intel.com,
wenwux.ma@intel.com, yuanx.wang@intel.com,
xingguang.he@intel.com, Cheng Jiang <cheng1.jiang@intel.com>
Subject: [PATCH v3] app/dma-perf: introduce dma-perf application
Date: Tue, 17 Jan 2023 12:05:26 +0000 [thread overview]
Message-ID: <20230117120526.39375-1-cheng1.jiang@intel.com> (raw)
In-Reply-To: <20221220010619.31829-1-cheng1.jiang@intel.com>
There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Only the memory copy performance test is supported for now.
Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
v3:
fixed compile issues for loongarch.
fixed compile issues for intel.
fixed coding style issues.
v2:
fixed some CI issues.
app/meson.build | 1 +
app/test-dma-perf/benchmark.c | 541 ++++++++++++++++++++++++++++++++++
app/test-dma-perf/benchmark.h | 12 +
app/test-dma-perf/config.ini | 61 ++++
app/test-dma-perf/main.c | 434 +++++++++++++++++++++++++++
app/test-dma-perf/main.h | 57 ++++
app/test-dma-perf/meson.build | 20 ++
7 files changed, 1126 insertions(+)
create mode 100644 app/test-dma-perf/benchmark.c
create mode 100644 app/test-dma-perf/benchmark.h
create mode 100644 app/test-dma-perf/config.ini
create mode 100644 app/test-dma-perf/main.c
create mode 100644 app/test-dma-perf/main.h
create mode 100644 app/test-dma-perf/meson.build
diff --git a/app/meson.build b/app/meson.build
index e32ea4bd5c..a060ad2725 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -28,6 +28,7 @@ apps = [
'test-regex',
'test-sad',
'test-security-perf',
+ 'test-dma-perf',
]
default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..7ec3f95643
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+
+#define MAX_DMA_CPL_NB 255
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
+
+/*
+ * Per-worker parameters passed to dma_mem_copy()/cpu_mem_copy() through
+ * rte_eal_remote_launch(). Both benchmark entry points fill this struct with
+ * positional initializers, so the field order must not change.
+ */
+struct lcore_params {
+ uint16_t dev_id; /* dmadev id used by the worker (the CPU benchmark passes 0) */
+ uint32_t nr_buf; /* number of buffer pairs handled by this worker */
+ uint16_t kick_batch; /* DMA submit/poll interval, see do_dma_mem_copy() */
+ uint32_t buf_size; /* length passed to each copy */
+ uint32_t repeat_times; /* number of passes over the worker's buffers */
+ uint16_t mpool_iter_step; /* stride used when walking srcs[]/dsts[] */
+ struct rte_mbuf **srcs; /* first source mbuf of this worker's slice */
+ struct rte_mbuf **dsts; /* first destination mbuf of this worker's slice */
+ uint8_t scenario_id; /* scenario number printed in the CSV line */
+};
+
+/* Argument of cache_flush_buf(): a set of mbufs whose data is to be flushed. */
+struct buf_info {
+ struct rte_mbuf **array; /* mbufs to walk */
+ uint32_t nr_buf; /* number of entries in array */
+ uint32_t buf_size; /* bytes of data considered per mbuf */
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+uint16_t dmadev_ids[MAX_WORKER_NB];
+uint32_t nb_dmadevs;
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+/*
+ * Write "In <func>:<lineno> - " followed by the printf-style message to
+ * stderr. Returns the sum of the fprintf() and vfprintf() return values.
+ */
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list args;
+	int written = fprintf(stderr, "In %s:%d - ", func, lineno);
+
+	va_start(args, format);
+	written += vfprintf(stderr, format, args);
+	va_end(args);
+
+	return written;
+}
+
+/*
+ * Derive the figures reported for one worker.
+ *
+ * p             worker parameters (buf_size, nr_buf, repeat_times are read)
+ * cp_cycle_sum  timer cycles spent on all copies
+ * time_sec      the same duration in seconds
+ * repeat_times  number of passes, used for the OPS figure
+ * memory        out: memory footprint (sources + destinations) in MB
+ * ave_cycle     out: average cycles per copy
+ * bandwidth     out: throughput in Gbps derived from ave_cycle
+ * ops           out: copies per second
+ *
+ * All intermediate products are computed in 64 bits (or double) so large
+ * footprints do not wrap, and a zero divisor yields 0 instead of a fault.
+ */
+static inline void
+calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
+		uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
+		float *bandwidth, uint64_t *ops)
+{
+	const uint64_t nr_copies = (uint64_t)p->repeat_times * p->nr_buf;
+	const uint64_t footprint = (uint64_t)p->buf_size * p->nr_buf * 2;
+
+	*memory = (uint32_t)(footprint / (1024 * 1024));
+	*ave_cycle = nr_copies != 0 ? cp_cycle_sum / nr_copies : 0;
+
+	if (*ave_cycle != 0)
+		*bandwidth = (double)p->buf_size * 8 * rte_get_timer_hz() /
+				((double)*ave_cycle * 1000 * 1000 * 1000.0);
+	else
+		*bandwidth = 0;
+
+	if (time_sec > 0)
+		*ops = (double)p->nr_buf * repeat_times / time_sec;
+	else
+		*ops = 0;
+}
+
+/*
+ * Print the result of one worker to stdout and store the matching CSV line
+ * in output_str[lcore_id].
+ *
+ * is_dma selects the DMA layout (which includes dev_id) or the CPU layout
+ * (which prints "NA" in the DMA column of the CSV line).
+ * The CSV line is skipped, with an error message, when lcore_id does not fit
+ * in output_str[], which only has MAX_WORKER_NB rows.
+ */
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
+		uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
+		float bandwidth, uint64_t ops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %u:\n", lcore_id, dev_id);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles: %" PRIu64 ","
+		" buffer size: %u, nr_buf: %u,"
+		" memory: %uMB, frequency: %" PRIu64 ".\n",
+		ave_cycle,
+		buf_size,
+		nr_buf,
+		memory,
+		rte_get_timer_hz());
+
+	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
+
+	if (lcore_id >= MAX_WORKER_NB) {
+		PRINT_ERR("lcore %u has no CSV output slot (max %u).\n",
+			lcore_id, MAX_WORKER_NB);
+		return;
+	}
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dev_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
+			CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, ops);
+}
+
+/*
+ * Flush the data area of every mbuf described by arg (a struct buf_info *)
+ * from the CPU caches. Only implemented for x86-64; a no-op elsewhere.
+ *
+ * The whole [data, data + buf_size) range is covered: the walk advances in
+ * 64-byte steps and the line holding the last byte is flushed explicitly, so
+ * buffers shorter than 64 bytes or with a partial trailing line are handled.
+ */
+static inline void
+cache_flush_buf(void *arg __maybe_unused)
+{
+#ifdef RTE_ARCH_X86_64
+	const struct buf_info *info = arg;
+	struct rte_mbuf **bufs = info->array;
+	uint64_t off;
+	uint32_t i;
+
+	for (i = 0; i < info->nr_buf; i++) {
+		char *data = rte_pktmbuf_mtod(bufs[i], char *);
+
+		for (off = 0; off < info->buf_size; off += 64)
+			__builtin_ia32_clflush(data + off);
+
+		/* the data start is not necessarily 64-byte aligned */
+		if (info->buf_size != 0)
+			__builtin_ia32_clflush(data + info->buf_size - 1);
+	}
+#endif
+}
+
+/*
+ * Configure and start one dmadev with a single memory-to-memory virtual
+ * channel (vchan 0) holding ring_size descriptors.
+ *
+ * Every failure, including a failing rte_dma_info_get(), terminates the
+ * application through rte_exit().
+ */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration\n");
+
+	/* info is only valid when the query itself succeeded */
+	if (rte_dma_info_get(dev_id, &info) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_info_get()\n");
+
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE,
+			"Error, no configured queues reported on device id %u\n", dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+
+/*
+ * Configure one dmadev per worker. The ids are stored in dmadev_ids[] and the
+ * number of configured devices in nb_dmadevs.
+ *
+ * Returns 0 on success, -1 when nb_workers does not fit in dmadev_ids[] or
+ * when fewer than nb_workers devices are found. Devices configured before
+ * the shortage was detected stay counted in nb_dmadevs.
+ */
+static int
+config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
+{
+	int16_t dev_id = rte_dma_next_dev(0);
+	uint32_t i;
+
+	nb_dmadevs = 0;
+
+	if (nb_workers > MAX_WORKER_NB) {
+		printf("Too many workers (%u), the maximum is %u.\n",
+			nb_workers, MAX_WORKER_NB);
+		return -1;
+	}
+
+	for (i = 0; i < nb_workers && dev_id != -1; i++) {
+		dmadev_ids[i] = dev_id;
+		configure_dmadev_queue(dmadev_ids[i], ring_size);
+		dev_id = rte_dma_next_dev(dev_id + 1);
+		++nb_dmadevs;
+	}
+
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+/*
+ * Run one pass of DMA copies over a worker's buffers on vchan 0 of dev_id.
+ *
+ * Buffers are visited with a stride of mpool_iter_step: for every offset in
+ * [0, mpool_iter_step) the indexes offset, offset + step, offset + 2 * step,
+ * ... that are below nr_buf are copied from srcs[] to dsts[], buf_size bytes
+ * each. After each offset pass the pending copies are submitted and polled
+ * until none is outstanding.
+ * The application exits if an enqueue still fails after the retry path.
+ */
+static inline void
+do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
+ uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
+{
+ int64_t async_cnt = 0;
+ int nr_cpl = 0;
+ uint32_t index;
+ uint16_t offset;
+ uint32_t i;
+
+ for (offset = 0; offset < mpool_iter_step; offset++) {
+ for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {
+ /* First attempt; a negative return means the copy was not enqueued. */
+ if (unlikely(rte_dma_copy(dev_id,
+ 0,
+ srcs[index]->buf_iova + srcs[index]->data_off,
+ dsts[index]->buf_iova + dsts[index]->data_off,
+ buf_size,
+ 0) < 0)) {
+ /* Kick what is queued, then poll until the ring reports free capacity. */
+ rte_dma_submit(dev_id, 0);
+ while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+ nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+ NULL, NULL);
+ async_cnt -= nr_cpl;
+ }
+ /* Second and last attempt for the same buffer pair. */
+ if (rte_dma_copy(dev_id,
+ 0,
+ srcs[index]->buf_iova + srcs[index]->data_off,
+ dsts[index]->buf_iova + dsts[index]->data_off,
+ buf_size,
+ 0) < 0) {
+ printf("enqueue fail again at %u\n", index);
+ printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+ rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+ }
+ }
+ async_cnt++;
+
+ /**
+ * Submit and poll once whenever the outstanding count is a multiple
+ * of kick_batch. A plain '%' is used, so kick_batch does not have to
+ * be a power of 2, but it must not be 0.
+ * NOTE(review): async_cnt is reduced by completions, so it counts the
+ * copies in flight rather than the copies enqueued - confirm this is
+ * the intended batching trigger.
+ */
+ if (unlikely((async_cnt % kick_batch) == 0)) {
+ rte_dma_submit(dev_id, 0);
+ /* add a poll to avoid ring full */
+ nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+ async_cnt -= nr_cpl;
+ }
+ }
+
+ /*
+ * Flush the tail of this offset pass and wait for every outstanding copy.
+ * NOTE(review): the error flag of rte_dma_completed() is not inspected;
+ * presumably this loop would not terminate if the device stopped
+ * reporting completions - confirm.
+ */
+ rte_dma_submit(dev_id, 0);
+ while (async_cnt > 0) {
+ nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+ async_cnt -= nr_cpl;
+ }
+ }
+}
+
+/*
+ * Worker entry point of the DMA benchmark.
+ *
+ * p is a struct lcore_params allocated with rte_malloc(); it is released
+ * here once the result has been reported. The copy pass is repeated
+ * repeat_times times between two TSC reads, then the figures are computed
+ * and printed. Always returns 0.
+ */
+static int
+dma_mem_copy(void *p)
+{
+	struct lcore_params *params = (struct lcore_params *)p;
+	const uint32_t lcore_id = rte_lcore_id();
+	const uint32_t rounds = params->repeat_times;
+	const uint32_t copy_len = params->buf_size;
+	const uint16_t batch = params->kick_batch;
+	const uint32_t nr_buf = params->nr_buf;
+	const uint16_t dev_id = params->dev_id;
+	const uint16_t step = params->mpool_iter_step;
+	struct rte_mbuf **src_bufs = params->srcs;
+	struct rte_mbuf **dst_bufs = params->dsts;
+	uint64_t start_tsc, stop_tsc, elapsed = 0, avg_cycles = 0;
+	uint64_t ops;
+	uint32_t memory;
+	uint32_t round;
+	float bandwidth;
+	double time_sec;
+
+	start_tsc = rte_rdtsc();
+
+	for (round = 0; round < rounds; round++)
+		do_dma_mem_copy(dev_id, nr_buf, batch, copy_len,
+			step, src_bufs, dst_bufs);
+
+	stop_tsc = rte_rdtsc();
+	elapsed = stop_tsc - start_tsc;
+	time_sec = (double)elapsed / rte_get_timer_hz();
+
+	calc_result(params, elapsed, time_sec, rounds, &memory,
+		&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, copy_len, nr_buf,
+		memory, bandwidth, ops, true);
+
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Worker entry point of the CPU benchmark.
+ *
+ * p is a struct lcore_params allocated with rte_malloc(); it is released
+ * here once the result has been reported. Buffers are walked with the same
+ * mpool_iter_step stride as the DMA path and copied with rte_memcpy().
+ * Always returns 0.
+ *
+ * The copy uses the mbuf virtual data addresses (rte_pktmbuf_mtod). The IO
+ * address of an mbuf is only usable as a CPU pointer when IOVA equals VA, so
+ * it must not be dereferenced here.
+ */
+static int
+cpu_mem_copy(void *p)
+{
+	uint32_t idx;
+	uint32_t lcore_id;
+	uint32_t memory;
+	uint64_t ops;
+	float bandwidth;
+	double time_sec;
+	struct lcore_params *params = (struct lcore_params *)p;
+	uint32_t repeat_times = params->repeat_times;
+	uint32_t buf_size = params->buf_size;
+	uint32_t lcore_nr_buf = params->nr_buf;
+	uint16_t mpool_iter_step = params->mpool_iter_step;
+	struct rte_mbuf **srcs = params->srcs;
+	struct rte_mbuf **dsts = params->dsts;
+	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
+	uint32_t k, j, offset;
+
+	begin = rte_rdtsc();
+
+	for (k = 0; k < repeat_times; k++) {
+		/* copy buffer from src to dst */
+		for (offset = 0; offset < mpool_iter_step; offset++) {
+			for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
+				rte_memcpy(rte_pktmbuf_mtod(dsts[idx], void *),
+					rte_pktmbuf_mtod(srcs[idx], void *),
+					(size_t)buf_size);
+			}
+		}
+	}
+
+	end = rte_rdtsc();
+	total_cycles = end - begin;
+	time_sec = (double)total_cycles / rte_get_timer_hz();
+
+	lcore_id = rte_lcore_id();
+
+	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
+		&avg_cycles, &bandwidth, &ops);
+	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
+		memory, bandwidth, ops, false);
+
+	rte_free(p);
+
+	return 0;
+}
+
+/*
+ * Create the source and destination mbuf pools and allocate cfg->nr_buf
+ * mbufs from each, storing them in newly allocated arrays *srcs and *dsts.
+ *
+ * Returns 0 on success and -1 on failure. On failure nothing is released
+ * here: the caller frees the arrays, the mbufs and the pools. To make that
+ * safe the arrays are zero-initialised (so unallocated slots are NULL) and
+ * the pool pointers are reset to NULL before anything is created.
+ */
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+		struct rte_mbuf ***dsts)
+{
+	uint32_t i;
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	src_pool = NULL;
+	dst_pool = NULL;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
+		return -1;
+	}
+
+	if (nr_buf == 0) {
+		printf("Error: no buffer to allocate, check mem_size and buf_size.\n");
+		return -1;
+	}
+
+	/* the pool data room size and mbuf data_len are 16-bit values */
+	if ((uint64_t)buf_size + RTE_PKTMBUF_HEADROOM > UINT16_MAX) {
+		printf("Error: buf_size %u does not fit in a single mbuf.\n", buf_size);
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64, /* cache size */
+			0, /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64, /* cache size */
+			0, /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	/* calloc: slots not reached by the loop below must read as NULL */
+	*srcs = calloc(nr_buf, sizeof(**srcs));
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = calloc(nr_buf, sizeof(**dsts));
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_buf; i++) {
+		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
+		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);
+		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
+			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
+			return -1;
+		}
+
+		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
+		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one DMA memory copy scenario described by cfg.
+ *
+ * The buffer count is derived from the memory footprint (mem_size, in MB)
+ * and the buffer size, then split evenly between the workers; each worker
+ * gets its own dmadev and runs dma_mem_copy() on the next worker lcore.
+ * cfg->nr_buf is updated with the computed buffer count.
+ *
+ * Invalid settings (zero buf_size, kick_batch or worker count, or a buffer
+ * count that does not fit in 32 bits) are reported and the scenario is
+ * skipped. All resources are released before returning, and the pool
+ * pointers are cleared so a later scenario cannot free them twice.
+ */
+void
+dma_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	uint16_t nb_workers = cfg->nb_workers;
+	uint32_t repeat_times = cfg->repeat_times;
+	uint64_t total_bufs;
+	uint32_t nr_buf;
+
+	/* each of these is used as a divisor further down */
+	if (buf_size == 0 || kick_batch == 0 || nb_workers == 0) {
+		printf("Error: buf_size, kick_batch and worker_threads must not be 0.\n");
+		return;
+	}
+
+	/* 64-bit arithmetic: mem_size in MB times 2^20 can exceed 32 bits */
+	total_bufs = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (total_bufs > UINT32_MAX) {
+		printf("Error: too many buffers for the given mem_size and buf_size.\n");
+		return;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)total_bufs;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
+		goto out;
+
+	if (cfg->cache_flush) {
+		struct buf_info info;
+
+		info.array = srcs;
+		info.buf_size = buf_size;
+		info.nr_buf = nr_buf;
+		cache_flush_buf(&info);
+
+		info.array = dsts;
+		cache_flush_buf(&info);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		struct lcore_params *p;
+
+		lcore_id = rte_get_next_lcore(lcore_id, true, true);
+		offset = nr_buf / nb_workers * i;
+
+		p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* positional initializer: must follow the field order of lcore_params */
+		*p = (struct lcore_params) {
+			dmadev_ids[i],
+			(uint32_t)(nr_buf/nb_workers),
+			kick_batch,
+			buf_size,
+			repeat_times,
+			mpool_iter_step,
+			srcs + offset,
+			dsts + offset,
+			cfg->scenario_id
+		};
+
+		rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	/* clear the file-scope pointers: they are reused by the next scenario */
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		dst_pool = NULL;
+	}
+
+	for (i = 0; i < nb_dmadevs; i++) {
+		printf("Stopping dmadev %d\n", dmadev_ids[i]);
+		rte_dma_stop(dmadev_ids[i]);
+	}
+	nb_dmadevs = 0;
+}
+
+/*
+ * Run one CPU memory copy scenario described by cfg.
+ *
+ * The buffer count is derived from the memory footprint (mem_size, in MB)
+ * and the buffer size, then split evenly between the workers, each running
+ * cpu_mem_copy() on its own lcore. cfg->nr_buf is updated with the computed
+ * buffer count.
+ *
+ * Invalid settings (zero buf_size or worker count, or a buffer count that
+ * does not fit in 32 bits) are reported and the scenario is skipped. All
+ * resources are released before returning, and the pool pointers are
+ * cleared so a later scenario cannot free them twice.
+ */
+void
+cpu_mem_copy_benchmark(struct test_configure *cfg)
+{
+	uint32_t i, offset;
+	uint32_t repeat_times = cfg->repeat_times;
+	uint32_t kick_batch = cfg->kick_batch.cur;
+	uint32_t buf_size = cfg->buf_size.cur;
+	uint16_t nb_workers = cfg->nb_workers;
+	uint16_t mpool_iter_step = cfg->mpool_iter_step;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	unsigned int lcore_id = 0;
+	uint64_t total_bufs;
+	uint32_t nr_buf;
+
+	/* both are used as divisors further down */
+	if (buf_size == 0 || nb_workers == 0) {
+		printf("Error: buf_size and worker_threads must not be 0.\n");
+		return;
+	}
+
+	/* 64-bit arithmetic: mem_size in MB times 2^20 can exceed 32 bits */
+	total_bufs = ((uint64_t)cfg->mem_size.cur * 1024 * 1024) / ((uint64_t)buf_size * 2);
+	if (total_bufs > UINT32_MAX) {
+		printf("Error: too many buffers for the given mem_size and buf_size.\n");
+		return;
+	}
+	nr_buf = cfg->nr_buf = (uint32_t)total_bufs;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	for (i = 0; i < nb_workers; i++) {
+		struct lcore_params *p;
+
+		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
+		offset = nr_buf / nb_workers * i;
+		p = rte_malloc(NULL, sizeof(*p), 0);
+		if (!p) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		/* positional initializer: must follow the field order of lcore_params */
+		*p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
+			buf_size, repeat_times, mpool_iter_step,
+			srcs + offset, dsts + offset, cfg->scenario_id };
+		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+out:
+	/* free env */
+	if (srcs) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(srcs[i]);
+		free(srcs);
+	}
+	if (dsts) {
+		for (i = 0; i < nr_buf; i++)
+			rte_pktmbuf_free(dsts[i]);
+		free(dsts);
+	}
+
+	/* clear the file-scope pointers: they are reused by the next scenario */
+	if (src_pool) {
+		rte_mempool_free(src_pool);
+		src_pool = NULL;
+	}
+	if (dst_pool) {
+		rte_mempool_free(dst_pool);
+		dst_pool = NULL;
+	}
+}
diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
new file mode 100644
index 0000000000..f5ad8d6d99
--- /dev/null
+++ b/app/test-dma-perf/benchmark.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _BENCHMARK_H_
+#define _BENCHMARK_H_
+
+void dma_mem_copy_benchmark(struct test_configure *cfg);
+
+void cpu_mem_copy_benchmark(struct test_configure *cfg);
+
+#endif /* _BENCHMARK_H_ */
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..e24bb19414
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,61 @@
+
+; Supported test types:
+; DMA_MEM_COPY|CPU_MEM_COPY
+
+; Parameters:
+; "mem_size","buf_size","dma_ring_size","kick_batch".
+; "mem_size" means the size of the memory footprint, in MB.
+; "buf_size" means the memory size of a single operation, in bytes.
+; "dma_ring_size" means the dma ring buffer size.
+; "kick_batch" means dma operation batch size.
+
+; Format: variable=first[,last,increment[,ADD|MUL]]
+; ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to control if the cache should be flushed.
+
+; repeat_times is used to control the repeat times of the whole case.
+
+; worker_threads is used to control the threads number of the test app.
+; It should be less than the core number.
+
+; mpool_iter_step is used to control the buffer continuity.
+
+; Bind DMA to lcore:
+; Specify the "lcore_dma" parameter.
+; The number of "lcore_dma" entries should be greater than or equal to the number of
+; "worker_threads"; otherwise, the remaining DMA devices are automatically allocated to
+; the threads that have no explicit binding. If the EAL parameters "-l" and "-a" are
+; specified, the lcores and devices listed in "lcore_dma" should be within their range.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+repeat_times=10
+worker_threads=1
+mpool_iter_step=1
+lcore_dma=lcore3@0000:00:04.0
+eal_args=--legacy-mem --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+repeat_times=100
+worker_threads=1
+mpool_iter_step=1
+eal_args=--no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..8041f5fdaf
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,434 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdio.h>
+#if !defined(RTE_EXEC_ENV_LINUX)
+
+/* Stub entry point for non-Linux builds: report that the test is skipped. */
+int
+main(int argc, char *argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	fputs("OS not supported, skipping test\n", stdout);
+
+	return 0;
+}
+
+#else
+
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+#include "benchmark.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),OPS\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+/*
+ * Values stored in test_configure.test_type. TEST_TYPE_NONE is 0, so an
+ * unused slot of the zero-initialised test_cases[] array reads as "no test".
+ */
+enum {
+ TEST_TYPE_NONE = 0,
+ TEST_TYPE_DMA_MEM_COPY, /* selected by type=DMA_MEM_COPY */
+ TEST_TYPE_CPU_MEM_COPY /* selected by type=CPU_MEM_COPY */
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+/*
+ * Append every non-empty row of output_str[] to the CSV file and clear the
+ * row afterwards. When need_blankline is set, two empty CSV rows are written
+ * first. The file is flushed before returning.
+ */
+static void
+output_csv(bool need_blankline)
+{
+	static const char empty_row[] = ",,,,,,,,\n";
+	uint32_t row;
+
+	if (need_blankline) {
+		fputs(empty_row, fd);
+		fputs(empty_row, fd);
+	}
+
+	for (row = 0; row < RTE_DIM(output_str); row++) {
+		if (output_str[row][0] == '\0')
+			continue;
+
+		fputs(output_str[row], fd);
+		memset(output_str[row], 0, MAX_OUTPUT_STR_LEN);
+	}
+
+	fflush(fd);
+}
+
+/* Write the test environment rows (title and timer frequency) to the CSV file. */
+static void
+output_env_info(void)
+{
+	const uint64_t timer_hz = rte_get_timer_hz();
+	char *title_row = output_str[0];
+	char *freq_row = output_str[1];
+
+	snprintf(title_row, MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(freq_row, MAX_OUTPUT_STR_LEN, "frequency,%" PRIu64 "\n", timer_hz);
+
+	output_csv(true);
+}
+
+/* Write the CSV column header of a case, preceded by two empty rows. */
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	const char *type_name = case_cfg->test_type_str;
+	char *header_row = output_str[0];
+
+	snprintf(header_row, MAX_OUTPUT_STR_LEN, CSV_HDR_FMT, case_id, type_name);
+
+	output_csv(true);
+}
+
+/* Dispatch one scenario to the benchmark that matches its test type. */
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	const uint8_t type = case_cfg->test_type;
+
+	if (type == TEST_TYPE_DMA_MEM_COPY) {
+		dma_mem_copy_benchmark(case_cfg);
+		return;
+	}
+
+	if (type == TEST_TYPE_CPU_MEM_COPY) {
+		cpu_mem_copy_benchmark(case_cfg);
+		return;
+	}
+
+	printf("Unknown test type. %s\n", case_cfg->test_type_str);
+}
+
+/*
+ * Run every scenario of one test case and append the results to the CSV
+ * file.
+ *
+ * The variable parameter is the last of mem_size, buf_size, ring_size and
+ * kick_batch that has a non-zero increment. It is swept from first to last,
+ * multiplied by incr for OP_MUL and increased by incr otherwise. Without a
+ * variable parameter the case is run once.
+ *
+ * The sweep stops as soon as a step would not move the value forward
+ * (multiplying by 1, multiplying 0) or would not fit in 32 bits, so it
+ * always terminates.
+ */
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry *var_entry = NULL;
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	/* one lcore is kept for the main thread */
+	if (nb_lcores <= case_cfg->nb_workers) {
+		printf("Case %u: Not enough lcores (%u) for all workers (%u).\n",
+			case_id, nb_lcores, case_cfg->nb_workers);
+		return;
+	}
+
+	RTE_LOG(INFO, DMA, "Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	if (var_entry) {
+		for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+			uint64_t next;
+
+			case_cfg->scenario_id++;
+			printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+			run_test_case(case_cfg);
+			output_csv(false);
+
+			if (var_entry->op == OP_MUL)
+				next = (uint64_t)var_entry->cur * var_entry->incr;
+			else
+				next = (uint64_t)var_entry->cur + var_entry->incr;
+
+			/* no progress or 32-bit wrap: the sweep is over */
+			if (next <= var_entry->cur || next > UINT32_MAX)
+				break;
+
+			var_entry->cur = (uint32_t)next;
+		}
+	} else {
+		run_test_case(case_cfg);
+		output_csv(false);
+	}
+}
+
+/*
+ * Parse a "first[,last,increment[,ADD|MUL]]" value into entry.
+ *
+ * Returns the number of comma-separated fields found (1 to
+ * MAX_PARAMS_PER_ENTRY), or a negative value when value is NULL, empty or
+ * too long for the internal buffer; entry is left untouched in that case.
+ * The operation is OP_MUL for "MUL", OP_ADD for any other fourth field and
+ * OP_NONE when no fourth field is given.
+ */
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr;
+	int len;
+
+	/* a missing configuration entry is handed over as NULL */
+	if (value == NULL || *value == '\0')
+		return -1;
+
+	len = snprintf(input, sizeof(input), "%s", value);
+	if (len < 0 || (size_t)len >= sizeof(input))
+		return -1;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		return args_nr;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else
+			entry->op = OP_ADD;
+	} else {
+		entry->op = OP_NONE;
+	}
+
+	return args_nr;
+}
+
+/* Return the integer value of a mandatory entry; exit when it is missing. */
+static int
+get_int_entry(struct rte_cfgfile *cfgfile, const char *section_name,
+		const char *entry_name)
+{
+	const char *value = rte_cfgfile_get_entry(cfgfile, section_name, entry_name);
+
+	if (value == NULL) {
+		printf("Error: No %s in %s.\n", entry_name, section_name);
+		exit(1);
+	}
+
+	return atoi(value);
+}
+
+/*
+ * Load ./config.ini and fill test_cases[], one entry per section.
+ *
+ * The process exits when the file cannot be opened, holds more than
+ * MAX_TEST_CASES sections, or a section lacks a valid "type" or one of the
+ * mandatory integer entries. When one of the four sweepable parameters
+ * (mem_size, buf_size, dma_ring_size, kick_batch) cannot be parsed, or more
+ * than one of them is given as a range, parsing stops and the offending
+ * case is marked TEST_TYPE_NONE so it is not run half-configured.
+ *
+ * eal_args is duplicated because the string returned by
+ * rte_cfgfile_get_entry() is no longer used after rte_cfgfile_close(); the
+ * copy is kept for the lifetime of the process.
+ */
+static void
+load_configs(void)
+{
+	static const char * const var_names[] = {
+		"mem_size", "buf_size", "dma_ring_size", "kick_batch"
+	};
+	struct test_configure_entry *var_entries[RTE_DIM(var_names)];
+	char names[MAX_TEST_CASES][CFG_NAME_LEN];
+	char *sections_name[MAX_TEST_CASES];
+	struct rte_cfgfile *cfgfile;
+	struct test_configure *test_case;
+	const char *section_name, *case_type, *value, *eal_args;
+	int nb_sections, i;
+	int args_nr, nb_vp;
+	size_t j;
+	bool parse_ok = true;
+
+	memset(names, 0, sizeof(names));
+	for (i = 0; i < MAX_TEST_CASES; i++)
+		sections_name[i] = names[i];
+
+	cfgfile = rte_cfgfile_load("./config.ini", 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+	rte_cfgfile_sections(cfgfile, sections_name, MAX_TEST_CASES);
+
+	for (i = 0; i < nb_sections && parse_ok; i++) {
+		test_case = &test_cases[i];
+		section_name = sections_name[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d.\n", i + 1);
+			exit(1);
+		}
+		if (!strcmp(case_type, DMA_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+		} else if (!strcmp(case_type, CPU_MEM_COPY)) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+		} else {
+			printf("Error: Cannot find case type %s.\n", case_type);
+			exit(1);
+		}
+
+		test_case->src_numa_node = get_int_entry(cfgfile, section_name, "src_numa_node");
+		test_case->dst_numa_node = get_int_entry(cfgfile, section_name, "dst_numa_node");
+
+		/* same order as var_names[] */
+		var_entries[0] = &test_case->mem_size;
+		var_entries[1] = &test_case->buf_size;
+		var_entries[2] = &test_case->ring_size;
+		var_entries[3] = &test_case->kick_batch;
+
+		nb_vp = 0;
+		for (j = 0; j < RTE_DIM(var_names); j++) {
+			value = rte_cfgfile_get_entry(cfgfile, section_name, var_names[j]);
+			args_nr = value != NULL ? parse_entry(value, var_entries[j]) : -1;
+			if (args_nr < 0) {
+				printf("parse error\n");
+				parse_ok = false;
+				break;
+			}
+			if (args_nr > 1)
+				nb_vp++;
+		}
+
+		if (parse_ok && nb_vp > 1) {
+			printf("%s, variable parameters can only have one.\n", section_name);
+			parse_ok = false;
+		}
+
+		if (!parse_ok) {
+			/* do not leave a half-configured case runnable */
+			test_case->test_type = TEST_TYPE_NONE;
+			break;
+		}
+
+		test_case->cache_flush = get_int_entry(cfgfile, section_name, "cache_flush");
+		test_case->repeat_times =
+			(uint32_t)get_int_entry(cfgfile, section_name, "repeat_times");
+		test_case->nb_workers =
+			(uint16_t)get_int_entry(cfgfile, section_name, "worker_threads");
+		test_case->mpool_iter_step =
+			(uint16_t)get_int_entry(cfgfile, section_name, "mpool_iter_step");
+
+		/* keep a private copy: it is read after the file has been closed */
+		eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->eal_args = NULL;
+		if (eal_args != NULL) {
+			test_case->eal_args = strdup(eal_args);
+			if (test_case->eal_args == NULL) {
+				printf("Error: No memory for eal_args of %s.\n", section_name);
+				exit(1);
+			}
+		}
+	}
+
+	rte_cfgfile_close(cfgfile);
+}
+
+/*
+ * Build the argument vector for rte_eal_init(): the command line arguments
+ * followed by the space-separated tokens of eal_args (may be NULL).
+ *
+ * new_argv must provide MAX_EAL_PARAM_NB buffers of MAX_EAL_PARAM_LEN bytes
+ * each, as set up by main(). Every copy is bounded by that size, empty
+ * tokens produced by repeated spaces are skipped, and arguments that do not
+ * fit in new_argv are dropped with a warning.
+ *
+ * Returns the number of arguments stored in new_argv.
+ */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int new_argc, token_nb;
+
+	if (argc > MAX_EAL_PARAM_NB) {
+		printf("Warning: only the first %d arguments are used.\n", MAX_EAL_PARAM_NB);
+		argc = MAX_EAL_PARAM_NB;
+	}
+
+	for (i = 0; i < argc; i++)
+		snprintf(new_argv[i], MAX_EAL_PARAM_LEN, "%s", argv[i]);
+
+	new_argc = argc < 0 ? 0 : argc;
+
+	if (eal_args) {
+		snprintf(args, sizeof(args), "%s", eal_args);
+		token_nb = rte_strsplit(args, strlen(args),
+				tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++) {
+			if (tokens[i][0] == '\0')
+				continue;
+			if (new_argc >= MAX_EAL_PARAM_NB) {
+				printf("Warning: too many EAL arguments, \"%s\" and later ones are dropped.\n",
+					tokens[i]);
+				break;
+			}
+			snprintf(new_argv[new_argc++], MAX_EAL_PARAM_LEN, "%s", tokens[i]);
+		}
+	}
+
+	return new_argc;
+}
+
+/*
+ * Load the configuration, then run every configured case in its own child
+ * process so that each case can initialise the EAL with its own arguments.
+ *
+ * The result file ./test_result.csv is truncated once by the parent and
+ * reopened in append mode by each child. The parent waits for each child
+ * and reports how it terminated.
+ *
+ * Returns 0 when all cases have been processed and EXIT_FAILURE when the
+ * result file cannot be created; fork() and waitpid() failures terminate
+ * the process with EXIT_FAILURE.
+ */
+int
+main(int argc __maybe_unused, char *argv[] __maybe_unused)
+{
+	int ret;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	int new_argc;
+
+	memset(args, 0, sizeof(args));
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	load_configs();
+	fd = fopen("./test_result.csv", "w");
+	if (!fd) {
+		printf("Open output CSV file error.\n");
+		return EXIT_FAILURE;
+	}
+	fclose(fd);
+
+	/* loop each case, run it */
+	for (i = 0; i < MAX_TEST_CASES; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE)
+			continue;
+
+		/* pending output would otherwise be written by parent and child */
+		fflush(stdout);
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n", i + 1);
+
+			if (test_cases[i].eal_args) {
+				new_argc = append_eal_args(argc, argv,
+					test_cases[i].eal_args, pargs);
+
+				ret = rte_eal_init(new_argc, pargs);
+			} else {
+				ret = rte_eal_init(argc, argv);
+			}
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen("./test_result.csv", "a");
+			if (!fd)
+				rte_exit(EXIT_FAILURE, "Open output CSV file error.\n");
+
+			if (i == 0)
+				output_env_info();
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n");
+			else
+				printf("Case process unknown terminated.\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
+#endif
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..a8fcf4f34d
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused __rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+/* How a swept parameter advances from one scenario to the next. */
+typedef enum {
+ OP_NONE = 0, /* no operation given in the configuration entry */
+ OP_ADD, /* cur += incr */
+ OP_MUL /* cur *= incr */
+} alg_op_type;
+
+/* One "first[,last,increment[,ADD|MUL]]" configuration value. */
+struct test_configure_entry {
+ uint32_t first; /* first value of the sweep, or the only value */
+ uint32_t last; /* inclusive upper bound of the sweep; 0 when not given */
+ uint32_t incr; /* step; 0 means the parameter is not swept */
+ alg_op_type op; /* how incr is applied */
+ uint32_t cur; /* value used by the scenario being run */
+};
+
+/* Settings of one test case, filled from one section of the config file. */
+struct test_configure {
+ uint8_t test_type; /* TEST_TYPE_* value; 0 marks an unused case */
+ const char *test_type_str; /* type name printed in the CSV header */
+ uint16_t src_numa_node; /* socket the source mbuf pool is created on */
+ uint16_t dst_numa_node; /* socket the destination mbuf pool is created on */
+ uint16_t opcode; /* NOTE(review): not referenced in the code shown here */
+ bool is_dma; /* NOTE(review): not referenced in the code shown here */
+ struct test_configure_entry mem_size; /* memory footprint in MB */
+ struct test_configure_entry buf_size; /* size of a single copy */
+ struct test_configure_entry ring_size; /* number of dmadev descriptors */
+ struct test_configure_entry kick_batch; /* DMA submit interval */
+ uint32_t cache_flush; /* non-zero: flush the buffers before the DMA test */
+ uint32_t nr_buf; /* buffer count, computed by the benchmark functions */
+ uint32_t repeat_times; /* passes over the buffers per scenario */
+ uint32_t nb_workers; /* number of worker lcores */
+ uint16_t mpool_iter_step; /* stride used when walking the buffers */
+ const char *eal_args; /* extra EAL arguments of the case, may be NULL */
+ uint8_t scenario_id; /* running scenario number within the case */
+};
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..001f67f6c1
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2022 Intel Corporation
+
+# meson file, for building this example as part of a main DPDK build.
+#
+# To build this example as a standalone application with an already-installed
+# DPDK instance, use 'make'
+
+if is_windows
+ build = false
+ reason = 'not supported on Windows'
+ subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+ 'main.c',
+ 'benchmark.c',
+)
--
2.35.1
next prev parent reply other threads:[~2023-01-17 12:55 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-12-20 1:06 [PATCH] " Cheng Jiang
2023-01-17 1:56 ` [PATCH v2] " Cheng Jiang
2023-01-17 13:00 ` Bruce Richardson
2023-01-17 13:54 ` Jiang, Cheng1
2023-01-17 14:03 ` Bruce Richardson
2023-01-18 1:46 ` Jiang, Cheng1
2023-01-17 12:05 ` Cheng Jiang [this message]
2023-01-17 15:44 ` [PATCH v3] " Bruce Richardson
2023-01-19 7:18 ` Jiang, Cheng1
2023-01-17 16:51 ` Bruce Richardson
2023-01-28 13:32 ` Jiang, Cheng1
2023-01-30 9:20 ` Bruce Richardson
2023-02-06 14:20 ` Jiang, Cheng1
2023-01-31 5:27 ` Hu, Jiayu
2023-04-20 7:22 [PATCH] " Cheng Jiang
2023-05-17 7:31 ` [PATCH v3] " Cheng Jiang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230117120526.39375-1-cheng1.jiang@intel.com \
--to=cheng1.jiang@intel.com \
--cc=bruce.richardson@intel.com \
--cc=dev@dpdk.org \
--cc=jiayu.hu@intel.com \
--cc=mb@smartsharesystems.com \
--cc=thomas@monjalon.net \
--cc=wenwux.ma@intel.com \
--cc=xingguang.he@intel.com \
--cc=xuan.ding@intel.com \
--cc=yuanx.wang@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).