DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] app/dma-perf: replace pktmbuf with mempool objects
@ 2023-12-12 10:37 Vipin Varghese
  2023-12-12 11:40 ` Morten Brørup
  2023-12-20 11:03 ` [PATCH v2] " Vipin Varghese
  0 siblings, 2 replies; 16+ messages in thread
From: Vipin Varghese @ 2023-12-12 10:37 UTC (permalink / raw)
  To: dev, stable, honest.jiang; +Cc: Thiyagrajan P

Replace pktmbuf pool with mempool, this allows increase in MOPS
especially in lower buffer size. Using Mempool, allows to reduce
the extra CPU cycles.

v1 changes:
 1. pktmbuf pool create with mempool create.
 2. create src & dst pointer array from the appropriate NUMA node.
 3. use get pool and put for mempool objects.
 4. remove pktmbuf_mtod for dma and cpu memcpy.

Test Results for pktmbuf vs mempool:
====================================

Format: Buffer Size | % AVG cycles | % AVG Gbps

Category-1: HW-DSA
-------------------
  64|-13.11| 14.97
 128|-41.49|  0.41
 256| -1.85|  1.20
 512| -9.38|  8.81
1024|  1.82| -2.00
1518|  0.00| -0.80
2048|  1.03| -0.91
4096|  0.00| -0.35
8192|  0.07| -0.08

Category-2: MEMCPY
-------------------
  64|-12.50|14.14
 128|-40.63|67.26
 256|-38.78|59.35
 512|-30.26|43.36
1024|-21.80|27.04
1518|-16.23|19.33
2048|-14.75|16.81
4096| -9.56|10.01
8192| -3.32| 3.12

Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
---
 app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 30 deletions(-)

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..dc6f16cc01 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -43,8 +43,8 @@ struct lcore_params {
 	uint16_t kick_batch;
 	uint32_t buf_size;
 	uint16_t test_secs;
-	struct rte_mbuf **srcs;
-	struct rte_mbuf **dsts;
+	void **srcs;
+	void **dsts;
 	volatile struct worker_info worker_info;
 };
 
@@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
 }
 
 static inline void
-cache_flush_buf(__rte_unused struct rte_mbuf **array,
+cache_flush_buf(__rte_unused void **array,
 		__rte_unused uint32_t buf_size,
 		__rte_unused uint32_t nr_buf)
 {
 #ifdef RTE_ARCH_X86_64
 	char *data;
-	struct rte_mbuf **srcs = array;
+	void **srcs = array;
 	uint32_t i, offset;
 
 	for (i = 0; i < nr_buf; i++) {
-		data = rte_pktmbuf_mtod(srcs[i], char *);
+		data = (char *) srcs[i];
 		for (offset = 0; offset < buf_size; offset += 64)
 			__builtin_ia32_clflush(data + offset);
 	}
@@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
 	const uint32_t nr_buf = para->nr_buf;
 	const uint16_t kick_batch = para->kick_batch;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint16_t nr_cpl;
 	uint64_t async_cnt = 0;
 	uint32_t i;
@@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
 dma_copy:
-			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
-				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			ret = rte_dma_copy(dev_id,
+					0,
+					(rte_iova_t) srcs[i],
+					(rte_iova_t) dsts[i],
+					buf_size,
+					0);
 			if (unlikely(ret < 0)) {
 				if (ret == -ENOSPC) {
 					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
@@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
 	volatile struct worker_info *worker_info = &(para->worker_info);
 	const uint32_t nr_buf = para->nr_buf;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint32_t i;
 
 	worker_info->stop_flag = false;
@@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
 
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
-			const void *src = rte_pktmbuf_mtod(dsts[i], void *);
-			void *dst = rte_pktmbuf_mtod(srcs[i], void *);
+			const void *src = (void *) dsts[i];
+			void *dst = (void *) srcs[i];
 
 			/* copy buffer form src to dst */
 			rte_memcpy(dst, src, (size_t)buf_size);
@@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg, void ***srcs,
+			void ***dsts)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
@@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
-	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+	src_pool = rte_mempool_create("Benchmark_DMA_SRC",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->src_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->src_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
 		return -1;
 	}
 
-	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+	dst_pool = rte_mempool_create("Benchmark_DMA_DST",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->dst_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->dst_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
 		return -1;
 	}
 
-	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
 	if (*srcs == NULL) {
 		printf("Error: srcs malloc failed.\n");
 		return -1;
 	}
 
-	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
 	if (*dsts == NULL) {
 		printf("Error: dsts malloc failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
-		printf("alloc src mbufs failed.\n");
+	if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src bufs failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
-		printf("alloc dst mbufs failed.\n");
+	if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst bufs failed.\n");
 		return -1;
 	}
 
@@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint16_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	void **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 out:
 	/* free mbufs used in the test */
 	if (srcs != NULL)
-		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_mempool_put_bulk(src_pool, srcs, nr_buf);
 	if (dsts != NULL)
-		rte_pktmbuf_free_bulk(dsts, nr_buf);
+		rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
 
 	/* free the points for the mbufs */
 	rte_free(srcs);
-- 
2.34.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 10:37 [PATCH] app/dma-perf: replace pktmbuf with mempool objects Vipin Varghese
@ 2023-12-12 11:40 ` Morten Brørup
  2023-12-12 14:38   ` Ferruh Yigit
  2023-12-20 11:03 ` [PATCH v2] " Vipin Varghese
  1 sibling, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2023-12-12 11:40 UTC (permalink / raw)
  To: Vipin Varghese, dev, stable, honest.jiang; +Cc: Thiyagrajan P

> From: Vipin Varghese [mailto:vipin.varghese@amd.com]
> Sent: Tuesday, 12 December 2023 11.38
> 
> Replace pktmbuf pool with mempool, this allows increase in MOPS
> especially in lower buffer size. Using Mempool, allows to reduce
> the extra CPU cycles.

I get the point of this change: It tests the performance of copying raw memory objects using respectively rte_memcpy and DMA, without the mbuf indirection overhead.

However, I still consider the existing test relevant: The performance of copying packets using respectively rte_memcpy and DMA.

So, instead of replacing mbufs with mempool objects, please add new test variants using mempool objects.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 11:40 ` Morten Brørup
@ 2023-12-12 14:38   ` Ferruh Yigit
  2023-12-12 15:16     ` Morten Brørup
  0 siblings, 1 reply; 16+ messages in thread
From: Ferruh Yigit @ 2023-12-12 14:38 UTC (permalink / raw)
  To: Morten Brørup, Vipin Varghese, dev, stable, honest.jiang
  Cc: Thiyagrajan P

On 12/12/2023 11:40 AM, Morten Brørup wrote:
>> From: Vipin Varghese [mailto:vipin.varghese@amd.com]
>> Sent: Tuesday, 12 December 2023 11.38
>>
>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>> especially in lower buffer size. Using Mempool, allows to reduce
>> the extra CPU cycles.
> 
> I get the point of this change: It tests the performance of copying raw memory objects using respectively rte_memcpy and DMA, without the mbuf indirection overhead.
> 
> However, I still consider the existing test relevant: The performance of copying packets using respectively rte_memcpy and DMA.
> 

This is DMA performance test application and packets are not used, using
pktmbuf just introduces overhead to the main focus of the application.

I am not sure if pktmbuf was selected intentionally for this test
application, but I assume it is there for historical reasons.


> So, instead of replacing mbufs with mempool objects, please add new test variants using mempool objects.
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 14:38   ` Ferruh Yigit
@ 2023-12-12 15:16     ` Morten Brørup
  2023-12-12 15:37       ` Bruce Richardson
  0 siblings, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2023-12-12 15:16 UTC (permalink / raw)
  To: Ferruh Yigit, Vipin Varghese, dev, stable, honest.jiang,
	bruce.richardson
  Cc: Thiyagrajan P

+TO: Bruce, please stop me if I'm completely off track here.

> From: Ferruh Yigit [mailto:ferruh.yigit@amd.com]
> Sent: Tuesday, 12 December 2023 15.38
> 
> On 12/12/2023 11:40 AM, Morten Brørup wrote:
> >> From: Vipin Varghese [mailto:vipin.varghese@amd.com]
> >> Sent: Tuesday, 12 December 2023 11.38
> >>
> >> Replace pktmbuf pool with mempool, this allows increase in MOPS
> >> especially in lower buffer size. Using Mempool, allows to reduce
> >> the extra CPU cycles.
> >
> > I get the point of this change: It tests the performance of copying
> raw memory objects using respectively rte_memcpy and DMA, without the
> mbuf indirection overhead.
> >
> > However, I still consider the existing test relevant: The performance
> of copying packets using respectively rte_memcpy and DMA.
> >
> 
> This is DMA performance test application and packets are not used,
> using
> pktmbuf just introduces overhead to the main focus of the application.
> 
> I am not sure if pktmuf selected intentionally for this test
> application, but I assume it is there because of historical reasons.

I think pktmbuf was selected intentionally, to provide more accurate results for application developers trying to determine when to use rte_memcpy and when to use DMA. Much like the "copy breakpoint" in Linux Ethernet drivers is used to determine which code path to take for each received packet.

Most applications will be working with pktmbufs, so these applications will also experience the pktmbuf overhead. Performance testing with the same overhead as the application will be better to help the application developer determine when to use rte_memcpy and when to use DMA when working with pktmbufs.

(Furthermore, for the pktmbuf tests, I wonder if copying performance could also depend on IOVA mode and RTE_IOVA_IN_MBUF.)

Nonetheless, there may also be use cases where raw mempool objects are being copied by rte_memcpy or DMA, so adding tests for these use cases are useful.


@Bruce, you were also deeply involved in the DMA library, and probably have more up-to-date practical experience with it. Am I right that pktmbuf overhead in these tests provides more "real life use"-like results? Or am I completely off track with my thinking here, i.e. the pktmbuf overhead is only noise?

> 
> 
> > So, instead of replacing mbufs with mempool objects, please add new
> test variants using mempool objects.
> >


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 15:16     ` Morten Brørup
@ 2023-12-12 15:37       ` Bruce Richardson
  2023-12-12 17:13         ` Varghese, Vipin
  0 siblings, 1 reply; 16+ messages in thread
From: Bruce Richardson @ 2023-12-12 15:37 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Ferruh Yigit, Vipin Varghese, dev, stable, honest.jiang, Thiyagrajan P

On Tue, Dec 12, 2023 at 04:16:20PM +0100, Morten Brørup wrote:
> +TO: Bruce, please stop me if I'm completely off track here.
> 
> > From: Ferruh Yigit [mailto:ferruh.yigit@amd.com] Sent: Tuesday, 12
> > December 2023 15.38
> > 
> > On 12/12/2023 11:40 AM, Morten Brørup wrote:
> > >> From: Vipin Varghese [mailto:vipin.varghese@amd.com] Sent: Tuesday,
> > >> 12 December 2023 11.38
> > >>
> > >> Replace pktmbuf pool with mempool, this allows increase in MOPS
> > >> especially in lower buffer size. Using Mempool, allows to reduce the
> > >> extra CPU cycles.
> > >
> > > I get the point of this change: It tests the performance of copying
> > raw memory objects using respectively rte_memcpy and DMA, without the
> > mbuf indirection overhead.
> > >
> > > However, I still consider the existing test relevant: The performance
> > of copying packets using respectively rte_memcpy and DMA.
> > >
> > 
> > This is DMA performance test application and packets are not used,
> > using pktmbuf just introduces overhead to the main focus of the
> > application.
> > 
> > I am not sure if pktmuf selected intentionally for this test
> > application, but I assume it is there because of historical reasons.
> 
> I think pktmbuf was selected intentionally, to provide more accurate
> results for application developers trying to determine when to use
> rte_memcpy and when to use DMA. Much like the "copy breakpoint" in Linux
> Ethernet drivers is used to determine which code path to take for each
> received packet.
> 
> Most applications will be working with pktmbufs, so these applications
> will also experience the pktmbuf overhead. Performance testing with the
> same overhead as the application will be better to help the application
> developer determine when to use rte_memcpy and when to use DMA when
> working with pktmbufs.
> 
> (Furthermore, for the pktmbuf tests, I wonder if copying performance
> could also depend on IOVA mode and RTE_IOVA_IN_MBUF.)
> 
> Nonetheless, there may also be use cases where raw mempool objects are
> being copied by rte_memcpy or DMA, so adding tests for these use cases
> are useful.
> 
> 
> @Bruce, you were also deeply involved in the DMA library, and probably
> have more up-to-date practical experience with it. Am I right that
> pktmbuf overhead in these tests provides more "real life use"-like
> results? Or am I completely off track with my thinking here, i.e. the
> pktmbuf overhead is only noise?
> 
I'm actually not that familiar with the dma-test application, so can't
comment on the specific overhead involved here. In the general case, if we
are just talking about the overhead of dereferencing the mbufs then I would
expect the overhead to be negligible. However, if we are looking to include
the cost of allocation and freeing of buffers, I'd try to avoid that as it
is a cost that would have to be paid for both SW copies and HW copies, so
should not count when calculating offload cost.

/Bruce


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 15:37       ` Bruce Richardson
@ 2023-12-12 17:13         ` Varghese, Vipin
  2023-12-12 18:09           ` Morten Brørup
  0 siblings, 1 reply; 16+ messages in thread
From: Varghese, Vipin @ 2023-12-12 17:13 UTC (permalink / raw)
  To: Bruce Richardson, Morten Brørup
  Cc: Yigit, Ferruh, dev, stable, honest.jiang, P, Thiyagarajan

[-- Attachment #1: Type: text/plain, Size: 4465 bytes --]

[Public]

Sharing a few critical points based on my exposure to the dma-perf application below

<Snipped>

On Tue, Dec 12, 2023 at 04:16:20PM +0100, Morten Brørup wrote:
> +TO: Bruce, please stop me if I'm completely off track here.
>
> > From: Ferruh Yigit [mailto:ferruh.yigit@amd.com] Sent: Tuesday, 12
> > December 2023 15.38
> >
> > On 12/12/2023 11:40 AM, Morten Brørup wrote:
> > >> From: Vipin Varghese [mailto:vipin.varghese@amd.com] Sent: Tuesday,
> > >> 12 December 2023 11.38
> > >>
> > >> Replace pktmbuf pool with mempool, this allows increase in MOPS
> > >> especially in lower buffer size. Using Mempool, allows to reduce the
> > >> extra CPU cycles.
> > >
> > > I get the point of this change: It tests the performance of copying
> > raw memory objects using respectively rte_memcpy and DMA, without the
> > mbuf indirection overhead.
> > >
> > > However, I still consider the existing test relevant: The performance
> > of copying packets using respectively rte_memcpy and DMA.
> > >
> >
> > This is DMA performance test application and packets are not used,
> > using pktmbuf just introduces overhead to the main focus of the
> > application.
> >
> > I am not sure if pktmuf selected intentionally for this test
> > application, but I assume it is there because of historical reasons.
>
> I think pktmbuf was selected intentionally, to provide more accurate
> results for application developers trying to determine when to use
> rte_memcpy and when to use DMA. Much like the "copy breakpoint" in Linux
> Ethernet drivers is used to determine which code path to take for each
> received packet.

yes Ferruh, this is the right understanding. In DPDK example we already have
dma-forward application which makes use of pktmbuf payload to copy over
new pktmbuf payload area.

by moving to mempool, we are actually now focusing on source and destination buffers.
This allows to create mempool objects with 2MB and 1GB src-dst areas. Thus allowing
to focus src to dst copy. With pktmbuf we were not able to achieve the same.


>
> Most applications will be working with pktmbufs, so these applications
> will also experience the pktmbuf overhead. Performance testing with the
> same overhead as the application will be better to help the application
> developer determine when to use rte_memcpy and when to use DMA when
> working with pktmbufs.

Morten, thank you for the input, but as shared above the DPDK example dma-fwd does
justice to such a scenario. In line with test-compress-perf & test-crypto-perf, IMHO test-dma-perf
should focus on getting the best values for the DMA engine and memcpy comparison.

>
> (Furthermore, for the pktmbuf tests, I wonder if copying performance
> could also depend on IOVA mode and RTE_IOVA_IN_MBUF.)
>
> Nonetheless, there may also be use cases where raw mempool objects are
> being copied by rte_memcpy or DMA, so adding tests for these use cases
> are useful.
>
>
> @Bruce, you were also deeply involved in the DMA library, and probably
> have more up-to-date practical experience with it. Am I right that
> pktmbuf overhead in these tests provides more "real life use"-like
> results? Or am I completely off track with my thinking here, i.e. the
> pktmbuf overhead is only noise?
>
I'm actually not that familiar with the dma-test application, so can't
comment on the specific overhead involved here. In the general case, if we
are just talking about the overhead of dereferencing the mbufs then I would
expect the overhead to be negligible. However, if we are looking to include
the cost of allocation and freeing of buffers, I'd try to avoid that as it
is a cost that would have to be paid for both SW copies and HW copies, so
should not count when calculating offload cost.

Bruce, as per test-dma-perf there is no repeated pktmbuf-alloc or pktmbuf-free.
Hence I disagree that the overhead discussed for pktmbuf here is related to alloc and free.
But the cost as per my investigation goes into fetching the cacheline and performing mtod on
each iteration.

/Bruce

I can rewrite the logic to make use of pktmbuf objects by sending the src and dst with pre-computed
mtod to avoid the overhead. But this will not resolve the 2MB and 1GB huge page copy alloc failures.
IMHO, along similar lines to the other perf applications, the dma-perf application should focus on actual device
performance over application performance.

[-- Attachment #2: Type: text/html, Size: 9057 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 17:13         ` Varghese, Vipin
@ 2023-12-12 18:09           ` Morten Brørup
  2023-12-12 18:13             ` Varghese, Vipin
  0 siblings, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2023-12-12 18:09 UTC (permalink / raw)
  To: Varghese, Vipin, Bruce Richardson
  Cc: Yigit, Ferruh, dev, stable, honest.jiang, P, Thiyagarajan

[-- Attachment #1: Type: text/plain, Size: 4796 bytes --]

 

From: Varghese, Vipin [mailto:Vipin.Varghese@amd.com] 
Sent: Tuesday, 12 December 2023 18.14



Sharing a few critical points based on my exposure to the dma-perf application below

 

<Snipped>

On Tue, Dec 12, 2023 at 04:16:20PM +0100, Morten Brørup wrote:
> +TO: Bruce, please stop me if I'm completely off track here.
>
> > From: Ferruh Yigit [mailto:ferruh.yigit@amd.com] Sent: Tuesday, 12
> > December 2023 15.38
> >
> > On 12/12/2023 11:40 AM, Morten Brørup wrote:
> > >> From: Vipin Varghese [mailto:vipin.varghese@amd.com] Sent: Tuesday,
> > >> 12 December 2023 11.38
> > >>
> > >> Replace pktmbuf pool with mempool, this allows increase in MOPS
> > >> especially in lower buffer size. Using Mempool, allows to reduce the
> > >> extra CPU cycles.
> > >
> > > I get the point of this change: It tests the performance of copying
> > raw memory objects using respectively rte_memcpy and DMA, without the
> > mbuf indirection overhead.
> > >
> > > However, I still consider the existing test relevant: The performance
> > of copying packets using respectively rte_memcpy and DMA.
> > >
> >
> > This is DMA performance test application and packets are not used,
> > using pktmbuf just introduces overhead to the main focus of the
> > application.
> >
> > I am not sure if pktmuf selected intentionally for this test
> > application, but I assume it is there because of historical reasons.
>
> I think pktmbuf was selected intentionally, to provide more accurate
> results for application developers trying to determine when to use
> rte_memcpy and when to use DMA. Much like the "copy breakpoint" in Linux
> Ethernet drivers is used to determine which code path to take for each
> received packet.

 

yes Ferruh, this is the right understanding. In DPDK example we already have 

dma-forward application which makes use of pktmbuf payload to copy over

new pktmbuf payload area. 

 

by moving to mempool, we are actually now focusing on source and destination buffers.

This allows to create mempool objects with 2MB and 1GB src-dst areas. Thus allowing

to focus src to dst copy. With pktmbuf we were not able to achieve the same.

 


>
> Most applications will be working with pktmbufs, so these applications
> will also experience the pktmbuf overhead. Performance testing with the
> same overhead as the application will be better to help the application
> developer determine when to use rte_memcpy and when to use DMA when
> working with pktmbufs.

 

Morten thank you for the input, but as shared above DPDK example dma-fwd does 

justice to such scenario. inline to test-compress-perf & test-crypto-perf IMHO test-dma-perf

should focus on getting best values of dma engine and memcpy comparision.


>
> (Furthermore, for the pktmbuf tests, I wonder if copying performance
> could also depend on IOVA mode and RTE_IOVA_IN_MBUF.)
>
> Nonetheless, there may also be use cases where raw mempool objects are
> being copied by rte_memcpy or DMA, so adding tests for these use cases
> are useful.
>
>
> @Bruce, you were also deeply involved in the DMA library, and probably
> have more up-to-date practical experience with it. Am I right that
> pktmbuf overhead in these tests provides more "real life use"-like
> results? Or am I completely off track with my thinking here, i.e. the
> pktmbuf overhead is only noise?
>
I'm actually not that familiar with the dma-test application, so can't
comment on the specific overhead involved here. In the general case, if we
are just talking about the overhead of dereferencing the mbufs then I would
expect the overhead to be negligible. However, if we are looking to include
the cost of allocation and freeing of buffers, I'd try to avoid that as it
is a cost that would have to be paid for both SW copies and HW copies, so
should not count when calculating offload cost.

 

Bruce, as per test-dma-perf there is no repeated pktmbuf-alloc or pktmbuf-free. 

Hence I disagree that the overhead discussed for pkmbuf here is not related to alloc and free.

But the cost as per my investigation goes into fetching the cacheline and performing mtod on

each iteration.

/Bruce

I can rewrite the logic to make use pktmbuf objects by sending the src and dst with pre-computed 

mtod to avoid the overhead. But this will not resolve the 2MB and 1GB huge page copy alloc failures.

IMHO, I believe in similar lines to other perf application, dma-perf application should focus on acutal device

performance over application application performance.

 

[MB:]

OK, Vipin has multiple good arguments for this patch. I am convinced, let's proceed with it.

 

Acked-by: Morten Brørup <mb@smartsharesystems.com>

 


[-- Attachment #2: Type: text/html, Size: 10850 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 18:09           ` Morten Brørup
@ 2023-12-12 18:13             ` Varghese, Vipin
  2023-12-20  9:17               ` Varghese, Vipin
  0 siblings, 1 reply; 16+ messages in thread
From: Varghese, Vipin @ 2023-12-12 18:13 UTC (permalink / raw)
  To: Morten Brørup, Bruce Richardson
  Cc: Yigit, Ferruh, dev, stable, honest.jiang, P, Thiyagarajan

[-- Attachment #1: Type: text/plain, Size: 5469 bytes --]

[AMD Official Use Only - General]

Thank you Morten for the understanding
________________________________
From: Morten Brørup <mb@smartsharesystems.com>
Sent: 12 December 2023 23:39
To: Varghese, Vipin <Vipin.Varghese@amd.com>; Bruce Richardson <bruce.richardson@intel.com>
Cc: Yigit, Ferruh <Ferruh.Yigit@amd.com>; dev@dpdk.org <dev@dpdk.org>; stable@dpdk.org <stable@dpdk.org>; honest.jiang@foxmail.com <honest.jiang@foxmail.com>; P, Thiyagarajan <Thiyagarajan.P@amd.com>
Subject: RE: [PATCH] app/dma-perf: replace pktmbuf with mempool objects

Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.




From: Varghese, Vipin [mailto:Vipin.Varghese@amd.com]
Sent: Tuesday, 12 December 2023 18.14


Sharing a few critical points based on my exposure to the dma-perf application below



<Snipped>

On Tue, Dec 12, 2023 at 04:16:20PM +0100, Morten Brørup wrote:
> +TO: Bruce, please stop me if I'm completely off track here.
>
> > From: Ferruh Yigit [mailto:ferruh.yigit@amd.com] Sent: Tuesday, 12
> > December 2023 15.38
> >
> > On 12/12/2023 11:40 AM, Morten Brørup wrote:
> > >> From: Vipin Varghese [mailto:vipin.varghese@amd.com] Sent: Tuesday,
> > >> 12 December 2023 11.38
> > >>
> > >> Replace pktmbuf pool with mempool, this allows increase in MOPS
> > >> especially in lower buffer size. Using Mempool, allows to reduce the
> > >> extra CPU cycles.
> > >
> > > I get the point of this change: It tests the performance of copying
> > raw memory objects using respectively rte_memcpy and DMA, without the
> > mbuf indirection overhead.
> > >
> > > However, I still consider the existing test relevant: The performance
> > of copying packets using respectively rte_memcpy and DMA.
> > >
> >
> > This is DMA performance test application and packets are not used,
> > using pktmbuf just introduces overhead to the main focus of the
> > application.
> >
> > I am not sure if pktmuf selected intentionally for this test
> > application, but I assume it is there because of historical reasons.
>
> I think pktmbuf was selected intentionally, to provide more accurate
> results for application developers trying to determine when to use
> rte_memcpy and when to use DMA. Much like the "copy breakpoint" in Linux
> Ethernet drivers is used to determine which code path to take for each
> received packet.



yes Ferruh, this is the right understanding. In DPDK example we already have

dma-forward application which makes use of pktmbuf payload to copy over

new pktmbuf payload area.



by moving to mempool, we are actually now focusing on source and destination buffers.

This allows to create mempool objects with 2MB and 1GB src-dst areas. Thus allowing

to focus src to dst copy. With pktmbuf we were not able to achieve the same.



>
> Most applications will be working with pktmbufs, so these applications
> will also experience the pktmbuf overhead. Performance testing with the
> same overhead as the application will be better to help the application
> developer determine when to use rte_memcpy and when to use DMA when
> working with pktmbufs.



Morten thank you for the input, but as shared above DPDK example dma-fwd does

justice to such scenario. inline to test-compress-perf & test-crypto-perf IMHO test-dma-perf

should focus on getting best values of dma engine and memcpy comparision.

>
> (Furthermore, for the pktmbuf tests, I wonder if copying performance
> could also depend on IOVA mode and RTE_IOVA_IN_MBUF.)
>
> Nonetheless, there may also be use cases where raw mempool objects are
> being copied by rte_memcpy or DMA, so adding tests for these use cases
> are useful.
>
>
> @Bruce, you were also deeply involved in the DMA library, and probably
> have more up-to-date practical experience with it. Am I right that
> pktmbuf overhead in these tests provides more "real life use"-like
> results? Or am I completely off track with my thinking here, i.e. the
> pktmbuf overhead is only noise?
>
I'm actually not that familiar with the dma-test application, so can't
comment on the specific overhead involved here. In the general case, if we
are just talking about the overhead of dereferencing the mbufs then I would
expect the overhead to be negligible. However, if we are looking to include
the cost of allocation and freeing of buffers, I'd try to avoid that as it
is a cost that would have to be paid for both SW copies and HW copies, so
should not count when calculating offload cost.



Bruce, as per test-dma-perf there is no repeated pktmbuf-alloc or pktmbuf-free.

Hence I disagree that the overhead discussed for pkmbuf here is not related to alloc and free.

But the cost as per my investigation goes into fetching the cacheline and performing mtod on

each iteration.

/Bruce

I can rewrite the logic to make use pktmbuf objects by sending the src and dst with pre-computed

mtod to avoid the overhead. But this will not resolve the 2MB and 1GB huge page copy alloc failures.

IMHO, I believe in similar lines to other perf application, dma-perf application should focus on acutal device

performance over application application performance.



[MB:]

OK, Vipin has multiple good arguments for this patch. I am convinced, let’s proceed with it.



Acked-by: Morten Brørup <mb@smartsharesystems.com>



[-- Attachment #2: Type: text/html, Size: 12540 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 18:13             ` Varghese, Vipin
@ 2023-12-20  9:17               ` Varghese, Vipin
  2023-12-20  9:21                 ` David Marchand
  0 siblings, 1 reply; 16+ messages in thread
From: Varghese, Vipin @ 2023-12-20  9:17 UTC (permalink / raw)
  To: Morten Brørup, Bruce Richardson
  Cc: Yigit, Ferruh, dev, stable, honest.jiang, P, Thiyagarajan

[-- Attachment #1: Type: text/plain, Size: 5780 bytes --]

just received an update marking this as `Superseded`. I will send again 
with `ACK` also


Thank you Morten for the understanding

> ------------------------------------------------------------------------
> *From:* Morten Brørup <mb@smartsharesystems.com>
> *Sent:* 12 December 2023 23:39
> *To:* Varghese, Vipin <Vipin.Varghese@amd.com>; Bruce Richardson 
> <bruce.richardson@intel.com>
> *Cc:* Yigit, Ferruh <Ferruh.Yigit@amd.com>; dev@dpdk.org 
> <dev@dpdk.org>; stable@dpdk.org <stable@dpdk.org>; 
> honest.jiang@foxmail.com <honest.jiang@foxmail.com>; P, Thiyagarajan 
> <Thiyagarajan.P@amd.com>
> *Subject:* RE: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
>
> 	
> Caution: This message originated from an External Source. Use proper 
> caution when opening attachments, clicking links, or responding.
>
>
> *From:*Varghese, Vipin [mailto:Vipin.Varghese@amd.com]
> *Sent:* Tuesday, 12 December 2023 18.14
>
> Sharing a few critical points based on my exposure to the dma-perf 
> application below
>
> <Snipped>
>
> On Tue, Dec 12, 2023 at 04:16:20PM +0100, Morten Brørup wrote:
> > +TO: Bruce, please stop me if I'm completely off track here.
> >
> > > From: Ferruh Yigit [mailto:ferruh.yigit@amd.com 
> <mailto:ferruh.yigit@amd.com>] Sent: Tuesday, 12
> > > December 2023 15.38
> > >
> > > On 12/12/2023 11:40 AM, Morten Brørup wrote:
> > > >> From: Vipin Varghese [mailto:vipin.varghese@amd.com 
> <mailto:vipin.varghese@amd.com>] Sent: Tuesday,
> > > >> 12 December 2023 11.38
> > > >>
> > > >> Replace pktmbuf pool with mempool, this allows increase in MOPS
> > > >> especially in lower buffer size. Using Mempool, allows to 
> reduce the
> > > >> extra CPU cycles.
> > > >
> > > > I get the point of this change: It tests the performance of copying
> > > raw memory objects using respectively rte_memcpy and DMA, without the
> > > mbuf indirection overhead.
> > > >
> > > > However, I still consider the existing test relevant: The 
> performance
> > > of copying packets using respectively rte_memcpy and DMA.
> > > >
> > >
> > > This is DMA performance test application and packets are not used,
> > > using pktmbuf just introduces overhead to the main focus of the
> > > application.
> > >
> > > I am not sure if pktmuf selected intentionally for this test
> > > application, but I assume it is there because of historical reasons.
> >
> > I think pktmbuf was selected intentionally, to provide more accurate
> > results for application developers trying to determine when to use
> > rte_memcpy and when to use DMA. Much like the "copy breakpoint" in Linux
> > Ethernet drivers is used to determine which code path to take for each
> > received packet.
>
> yes Ferruh, this is the right understanding. In DPDK example we 
> already have
>
> dma-forward application which makes use of pktmbuf payload to copy over
>
> new pktmbuf payload area.
>
> by moving to mempool, we are actually now focusing on source and 
> destination buffers.
>
> This allows to create mempool objects with 2MB and 1GB src-dst areas. 
> Thus allowing
>
> to focus src to dst copy. With pktmbuf we were not able to achieve the 
> same.
>
>
> >
> > Most applications will be working with pktmbufs, so these applications
> > will also experience the pktmbuf overhead. Performance testing with the
> > same overhead as the application will be better to help the application
> > developer determine when to use rte_memcpy and when to use DMA when
> > working with pktmbufs.
>
> Morten thank you for the input, but as shared above DPDK example 
> dma-fwd does
>
> justice to such scenario. inline to test-compress-perf & 
> test-crypto-perf IMHO test-dma-perf
>
> should focus on getting best values of dma engine and memcpy comparison.
>
>
> >
> > (Furthermore, for the pktmbuf tests, I wonder if copying performance
> > could also depend on IOVA mode and RTE_IOVA_IN_MBUF.)
> >
> > Nonetheless, there may also be use cases where raw mempool objects are
> > being copied by rte_memcpy or DMA, so adding tests for these use cases
> > are useful.
> >
> >
> > @Bruce, you were also deeply involved in the DMA library, and probably
> > have more up-to-date practical experience with it. Am I right that
> > pktmbuf overhead in these tests provides more "real life use"-like
> > results? Or am I completely off track with my thinking here, i.e. the
> > pktmbuf overhead is only noise?
> >
> I'm actually not that familiar with the dma-test application, so can't
> comment on the specific overhead involved here. In the general case, if we
> are just talking about the overhead of dereferencing the mbufs then I 
> would
> expect the overhead to be negligible. However, if we are looking to 
> include
> the cost of allocation and freeing of buffers, I'd try to avoid that as it
> is a cost that would have to be paid for both SW copies and HW copies, so
> should not count when calculating offload cost.
>
> Bruce, as per test-dma-perf there is no repeated pktmbuf-alloc or 
> pktmbuf-free.
>
> Hence I disagree that the overhead discussed for pkmbuf here is not 
> related to alloc and free.
>
> But the cost as per my investigation goes into fetching the cacheline 
> and performing mtod on
>
> each iteration.
>
> /Bruce
>
> I can rewrite the logic to make use pktmbuf objects by sending the src 
> and dst with pre-computed
>
> mtod to avoid the overhead. But this will not resolve the 2MB and 1GB 
> huge page copy alloc failures.
>
> IMHO, I believe in similar lines to other perf application, dma-perf 
> application should focus on acutal device
>
> performance over application application performance.
>
> [MB:]
>
> OK, Vipin has multiple good arguments for this patch. I am convinced, 
> let’s proceed with it.
>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>

[-- Attachment #2: Type: text/html, Size: 20546 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-20  9:17               ` Varghese, Vipin
@ 2023-12-20  9:21                 ` David Marchand
  0 siblings, 0 replies; 16+ messages in thread
From: David Marchand @ 2023-12-20  9:21 UTC (permalink / raw)
  To: Varghese, Vipin
  Cc: Morten Brørup, Bruce Richardson, Yigit, Ferruh, dev, stable,
	honest.jiang, P, Thiyagarajan

On Wed, Dec 20, 2023 at 10:18 AM Varghese, Vipin <vipin.varghese@amd.com> wrote:
>
> just received an update marking this as `Superseded`. I will send again with `ACK` also

I saw multiple versions of a patch, with the last one marked as
deferred, so I cleaned the previous revision as superseded.
There are many instances when people are not cleaning their stuff in patchwork.

If something is wrong with it, then please fix yourself.
Thanks.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-12 10:37 [PATCH] app/dma-perf: replace pktmbuf with mempool objects Vipin Varghese
  2023-12-12 11:40 ` Morten Brørup
@ 2023-12-20 11:03 ` Vipin Varghese
  2024-02-26  2:05   ` fengchengwen
  1 sibling, 1 reply; 16+ messages in thread
From: Vipin Varghese @ 2023-12-20 11:03 UTC (permalink / raw)
  To: dev, david.marchand, honest.jiang
  Cc: Vipin Varghese, Morten Brørup, Thiyagrajan P

From: Vipin Varghese <Vipin.Varghese@amd.com>

Replace pktmbuf pool with mempool, this allows increase in MOPS
especially in lower buffer size. Using Mempool, allows to reduce
the extra CPU cycles.

Changes made are
1. pktmbuf pool create with mempool create.
2. create src & dst pointer array from the appropriate numa.
3. use get pool and put for mempool objects.
4. remove pktmbuf_mtod for dma and cpu memcpy.

v2 changes:
 - add ACK from  Morten Brørup

v1 changes:
 - pktmbuf pool create with mempool create.
 - create src & dst pointer array from the appropriate numa.
 - use get pool and put for mempool objects.
 - remove pktmbuf_mtod for dma and cpu memcpy.

Test Results for pktmbuf vs mempool:
====================================

Format: Buffer Size | % AVG cycles | % AVG Gbps

Category-1: HW-DSA
-------------------
  64|-13.11| 14.97
 128|-41.49|  0.41
 256| -1.85|  1.20
 512| -9.38|  8.81
1024|  1.82| -2.00
1518|  0.00| -0.80
2048|  1.03| -0.91
4096|  0.00| -0.35
8192|  0.07| -0.08

Category-2: MEMCPY
-------------------
  64|-12.50|14.14
 128|-40.63|67.26
 256|-38.78|59.35
 512|-30.26|43.36
1024|-21.80|27.04
1518|-16.23|19.33
2048|-14.75|16.81
4096| -9.56|10.01
8192| -3.32| 3.12

Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
---
---
 app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 30 deletions(-)

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..dc6f16cc01 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -43,8 +43,8 @@ struct lcore_params {
 	uint16_t kick_batch;
 	uint32_t buf_size;
 	uint16_t test_secs;
-	struct rte_mbuf **srcs;
-	struct rte_mbuf **dsts;
+	void **srcs;
+	void **dsts;
 	volatile struct worker_info worker_info;
 };
 
@@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
 }
 
 static inline void
-cache_flush_buf(__rte_unused struct rte_mbuf **array,
+cache_flush_buf(__rte_unused void **array,
 		__rte_unused uint32_t buf_size,
 		__rte_unused uint32_t nr_buf)
 {
 #ifdef RTE_ARCH_X86_64
 	char *data;
-	struct rte_mbuf **srcs = array;
+	void **srcs = array;
 	uint32_t i, offset;
 
 	for (i = 0; i < nr_buf; i++) {
-		data = rte_pktmbuf_mtod(srcs[i], char *);
+		data = (char *) srcs[i];
 		for (offset = 0; offset < buf_size; offset += 64)
 			__builtin_ia32_clflush(data + offset);
 	}
@@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
 	const uint32_t nr_buf = para->nr_buf;
 	const uint16_t kick_batch = para->kick_batch;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint16_t nr_cpl;
 	uint64_t async_cnt = 0;
 	uint32_t i;
@@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
 dma_copy:
-			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
-				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			ret = rte_dma_copy(dev_id,
+					0,
+					(rte_iova_t) srcs[i],
+					(rte_iova_t) dsts[i],
+					buf_size,
+					0);
 			if (unlikely(ret < 0)) {
 				if (ret == -ENOSPC) {
 					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
@@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
 	volatile struct worker_info *worker_info = &(para->worker_info);
 	const uint32_t nr_buf = para->nr_buf;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint32_t i;
 
 	worker_info->stop_flag = false;
@@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
 
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
-			const void *src = rte_pktmbuf_mtod(dsts[i], void *);
-			void *dst = rte_pktmbuf_mtod(srcs[i], void *);
+			const void *src = (void *) dsts[i];
+			void *dst = (void *) srcs[i];
 
 			/* copy buffer form src to dst */
 			rte_memcpy(dst, src, (size_t)buf_size);
@@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg, void ***srcs,
+			void ***dsts)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
@@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
-	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+	src_pool = rte_mempool_create("Benchmark_DMA_SRC",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->src_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->src_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
 		return -1;
 	}
 
-	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+	dst_pool = rte_mempool_create("Benchmark_DMA_DST",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->dst_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->dst_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
 		return -1;
 	}
 
-	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
 	if (*srcs == NULL) {
 		printf("Error: srcs malloc failed.\n");
 		return -1;
 	}
 
-	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
 	if (*dsts == NULL) {
 		printf("Error: dsts malloc failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
-		printf("alloc src mbufs failed.\n");
+	if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src bufs failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
-		printf("alloc dst mbufs failed.\n");
+	if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst bufs failed.\n");
 		return -1;
 	}
 
@@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint16_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	void **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 out:
 	/* free mbufs used in the test */
 	if (srcs != NULL)
-		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_mempool_put_bulk(src_pool, srcs, nr_buf);
 	if (dsts != NULL)
-		rte_pktmbuf_free_bulk(dsts, nr_buf);
+		rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
 
 	/* free the points for the mbufs */
 	rte_free(srcs);
-- 
2.34.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] app/dma-perf: replace pktmbuf with mempool objects
  2023-12-20 11:03 ` [PATCH v2] " Vipin Varghese
@ 2024-02-26  2:05   ` fengchengwen
  2024-02-27  9:57     ` Varghese, Vipin
  0 siblings, 1 reply; 16+ messages in thread
From: fengchengwen @ 2024-02-26  2:05 UTC (permalink / raw)
  To: Vipin Varghese, dev, david.marchand, honest.jiang
  Cc: Morten Brørup, Thiyagrajan P, Ferruh Yigit

Hi Vipin,

On 2023/12/20 19:03, Vipin Varghese wrote:
> From: Vipin Varghese <Vipin.Varghese@amd.com>
> 
> Replace pktmbuf pool with mempool, this allows increase in MOPS
> especially in lower buffer size. Using Mempool, allows to reduce
> the extra CPU cycles.
> 
> Changes made are
> 1. pktmbuf pool create with mempool create.
> 2. create src & dst pointer array from the appropaite numa.
> 3. use get pool and put for mempool objects.
> 4. remove pktmbuf_mtod for dma and cpu memcpy.
> 
> v2 changes:
>  - add ACK from  Morten Brørup
> 
> v1 changes:
>  - pktmbuf pool create with mempool create.
>  - create src & dst pointer array from the appropaite numa.
>  - use get pool and put for mempool objects.
>  - remove pktmbuf_mtod for dma and cpu memcpy.
> 
> Test Results for pktmbuf vs mempool:
> ====================================
> 
> Format: Buffer Size | % AVG cycles | % AVG Gbps
> 
> Category-1: HW-DSA
> -------------------
>   64|-13.11| 14.97
>  128|-41.49|  0.41
>  256| -1.85|  1.20
>  512| -9.38|  8.81
> 1024|  1.82| -2.00
> 1518|  0.00| -0.80
> 2048|  1.03| -0.91
> 4096|  0.00| -0.35
> 8192|  0.07| -0.08
> 
> Category-2: MEMCPY
> -------------------
>   64|-12.50|14.14
>  128|-40.63|67.26
>  256|-38.78|59.35
>  512|-30.26|43.36
> 1024|-21.80|27.04
> 1518|-16.23|19.33
> 2048|-14.75|16.81
> 4096| -9.56|10.01
> 8192| -3.32| 3.12
> 
> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
> ---
> ---
>  app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>  1 file changed, 44 insertions(+), 30 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 9b1f58c78c..dc6f16cc01 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -43,8 +43,8 @@ struct lcore_params {
>  	uint16_t kick_batch;
>  	uint32_t buf_size;
>  	uint16_t test_secs;
> -	struct rte_mbuf **srcs;
> -	struct rte_mbuf **dsts;
> +	void **srcs;
> +	void **dsts;
>  	volatile struct worker_info worker_info;
>  };
>  
> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>  }
>  
>  static inline void
> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
> +cache_flush_buf(__rte_unused void **array,
>  		__rte_unused uint32_t buf_size,
>  		__rte_unused uint32_t nr_buf)
>  {
>  #ifdef RTE_ARCH_X86_64
>  	char *data;
> -	struct rte_mbuf **srcs = array;
> +	void **srcs = array;
>  	uint32_t i, offset;
>  
>  	for (i = 0; i < nr_buf; i++) {
> -		data = rte_pktmbuf_mtod(srcs[i], char *);
> +		data = (char *) srcs[i];
>  		for (offset = 0; offset < buf_size; offset += 64)
>  			__builtin_ia32_clflush(data + offset);
>  	}
> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>  	const uint32_t nr_buf = para->nr_buf;
>  	const uint16_t kick_batch = para->kick_batch;
>  	const uint32_t buf_size = para->buf_size;
> -	struct rte_mbuf **srcs = para->srcs;
> -	struct rte_mbuf **dsts = para->dsts;
> +	void **srcs = para->srcs;
> +	void **dsts = para->dsts;
>  	uint16_t nr_cpl;
>  	uint64_t async_cnt = 0;
>  	uint32_t i;
> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>  	while (1) {
>  		for (i = 0; i < nr_buf; i++) {
>  dma_copy:
> -			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
> -				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
> +			ret = rte_dma_copy(dev_id,
> +					0,
> +					(rte_iova_t) srcs[i],
> +					(rte_iova_t) dsts[i],

We should consider that IOVA != VA, so this should use rte_mempool_virt2iova().
But since this commit is mainly meant to eliminate the address-conversion overhead, we
should prepare the IOVA for DMA copy, and the VA for memory copy.

I would prefer to keep pkt_mbuf, but add two new fields, create them in setup_memory_env(),
then use them directly in do_xxx_mem_copy.

Thanks.

> +					buf_size,
> +					0);
>  			if (unlikely(ret < 0)) {
>  				if (ret == -ENOSPC) {
>  					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>  	volatile struct worker_info *worker_info = &(para->worker_info);
>  	const uint32_t nr_buf = para->nr_buf;
>  	const uint32_t buf_size = para->buf_size;
> -	struct rte_mbuf **srcs = para->srcs;
> -	struct rte_mbuf **dsts = para->dsts;
> +	void **srcs = para->srcs;
> +	void **dsts = para->dsts;
>  	uint32_t i;
>  
>  	worker_info->stop_flag = false;
> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>  
>  	while (1) {
>  		for (i = 0; i < nr_buf; i++) {
> -			const void *src = rte_pktmbuf_mtod(dsts[i], void *);
> -			void *dst = rte_pktmbuf_mtod(srcs[i], void *);
> +			const void *src = (void *) dsts[i];
> +			void *dst = (void *) srcs[i];
>  
>  			/* copy buffer form src to dst */
>  			rte_memcpy(dst, src, (size_t)buf_size);
> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>  }
>  
>  static int
> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> -			struct rte_mbuf ***dsts)
> +setup_memory_env(struct test_configure *cfg, void ***srcs,
> +			void ***dsts)
>  {
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	unsigned int nr_sockets;
> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		return -1;
>  	}
>  
> -	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +	src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>  			nr_buf,
> +			buf_size,
>  			0,
>  			0,
> -			buf_size + RTE_PKTMBUF_HEADROOM,
> -			cfg->src_numa_node);
> +			NULL,
> +			NULL,
> +			NULL,
> +			NULL,
> +			cfg->src_numa_node,
> +			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>  	if (src_pool == NULL) {
>  		PRINT_ERR("Error with source mempool creation.\n");
>  		return -1;
>  	}
>  
> -	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +	dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>  			nr_buf,
> +			buf_size,
>  			0,
>  			0,
> -			buf_size + RTE_PKTMBUF_HEADROOM,
> -			cfg->dst_numa_node);
> +			NULL,
> +			NULL,
> +			NULL,
> +			NULL,
> +			cfg->dst_numa_node,
> +			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>  	if (dst_pool == NULL) {
>  		PRINT_ERR("Error with destination mempool creation.\n");
>  		return -1;
>  	}
>  
> -	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	*srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>  	if (*srcs == NULL) {
>  		printf("Error: srcs malloc failed.\n");
>  		return -1;
>  	}
>  
> -	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	*dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>  	if (*dsts == NULL) {
>  		printf("Error: dsts malloc failed.\n");
>  		return -1;
>  	}
>  
> -	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
> -		printf("alloc src mbufs failed.\n");
> +	if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
> +		printf("alloc src bufs failed.\n");
>  		return -1;
>  	}
>  
> -	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
> -		printf("alloc dst mbufs failed.\n");
> +	if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
> +		printf("alloc dst bufs failed.\n");
>  		return -1;
>  	}
>  
> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	uint16_t i;
>  	uint32_t offset;
>  	unsigned int lcore_id = 0;
> -	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	void **srcs = NULL, **dsts = NULL;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	uint16_t kick_batch = cfg->kick_batch.cur;
> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  out:
>  	/* free mbufs used in the test */
>  	if (srcs != NULL)
> -		rte_pktmbuf_free_bulk(srcs, nr_buf);
> +		rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>  	if (dsts != NULL)
> -		rte_pktmbuf_free_bulk(dsts, nr_buf);
> +		rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>  
>  	/* free the points for the mbufs */
>  	rte_free(srcs);
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] app/dma-perf: replace pktmbuf with mempool objects
  2024-02-26  2:05   ` fengchengwen
@ 2024-02-27  9:57     ` Varghese, Vipin
  2024-02-27 12:27       ` fengchengwen
  0 siblings, 1 reply; 16+ messages in thread
From: Varghese, Vipin @ 2024-02-27  9:57 UTC (permalink / raw)
  To: fengchengwen, dev, david.marchand, honest.jiang
  Cc: Morten Brørup, Thiyagrajan P, Ferruh Yigit


On 2/26/2024 7:35 AM, fengchengwen wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Hi Vipin,
>
> On 2023/12/20 19:03, Vipin Varghese wrote:
>> From: Vipin Varghese <Vipin.Varghese@amd.com>
>>
>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>> especially in lower buffer size. Using Mempool, allows to reduce
>> the extra CPU cycles.
>>
>> Changes made are
>> 1. pktmbuf pool create with mempool create.
>> 2. create src & dst pointer array from the appropaite numa.
>> 3. use get pool and put for mempool objects.
>> 4. remove pktmbuf_mtod for dma and cpu memcpy.
>>
>> v2 changes:
>>   - add ACK from  Morten Brørup
>>
>> v1 changes:
>>   - pktmbuf pool create with mempool create.
>>   - create src & dst pointer array from the appropaite numa.
>>   - use get pool and put for mempool objects.
>>   - remove pktmbuf_mtod for dma and cpu memcpy.
>>
>> Test Results for pktmbuf vs mempool:
>> ====================================
>>
>> Format: Buffer Size | % AVG cycles | % AVG Gbps
>>
>> Category-1: HW-DSA
>> -------------------
>>    64|-13.11| 14.97
>>   128|-41.49|  0.41
>>   256| -1.85|  1.20
>>   512| -9.38|  8.81
>> 1024|  1.82| -2.00
>> 1518|  0.00| -0.80
>> 2048|  1.03| -0.91
>> 4096|  0.00| -0.35
>> 8192|  0.07| -0.08
>>
>> Category-2: MEMCPY
>> -------------------
>>    64|-12.50|14.14
>>   128|-40.63|67.26
>>   256|-38.78|59.35
>>   512|-30.26|43.36
>> 1024|-21.80|27.04
>> 1518|-16.23|19.33
>> 2048|-14.75|16.81
>> 4096| -9.56|10.01
>> 8192| -3.32| 3.12
>>
>> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
>> ---
>> ---
>>   app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>>   1 file changed, 44 insertions(+), 30 deletions(-)
>>
>> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
>> index 9b1f58c78c..dc6f16cc01 100644
>> --- a/app/test-dma-perf/benchmark.c
>> +++ b/app/test-dma-perf/benchmark.c
>> @@ -43,8 +43,8 @@ struct lcore_params {
>>        uint16_t kick_batch;
>>        uint32_t buf_size;
>>        uint16_t test_secs;
>> -     struct rte_mbuf **srcs;
>> -     struct rte_mbuf **dsts;
>> +     void **srcs;
>> +     void **dsts;
>>        volatile struct worker_info worker_info;
>>   };
>>
>> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>>   }
>>
>>   static inline void
>> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
>> +cache_flush_buf(__rte_unused void **array,
>>                __rte_unused uint32_t buf_size,
>>                __rte_unused uint32_t nr_buf)
>>   {
>>   #ifdef RTE_ARCH_X86_64
>>        char *data;
>> -     struct rte_mbuf **srcs = array;
>> +     void **srcs = array;
>>        uint32_t i, offset;
>>
>>        for (i = 0; i < nr_buf; i++) {
>> -             data = rte_pktmbuf_mtod(srcs[i], char *);
>> +             data = (char *) srcs[i];
>>                for (offset = 0; offset < buf_size; offset += 64)
>>                        __builtin_ia32_clflush(data + offset);
>>        }
>> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>>        const uint32_t nr_buf = para->nr_buf;
>>        const uint16_t kick_batch = para->kick_batch;
>>        const uint32_t buf_size = para->buf_size;
>> -     struct rte_mbuf **srcs = para->srcs;
>> -     struct rte_mbuf **dsts = para->dsts;
>> +     void **srcs = para->srcs;
>> +     void **dsts = para->dsts;
>>        uint16_t nr_cpl;
>>        uint64_t async_cnt = 0;
>>        uint32_t i;
>> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>>        while (1) {
>>                for (i = 0; i < nr_buf; i++) {
>>   dma_copy:
>> -                     ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
>> -                             rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>> +                     ret = rte_dma_copy(dev_id,
>> +                                     0,
>> +                                     (rte_iova_t) srcs[i],
>> +                                     (rte_iova_t) dsts[i],
Thank you ChengWen for the suggestion, please find my observations below
> should consider IOVA != VA, so here should be with rte_mempool_virt2iova(),
> but this commit is mainly to eliminate the address convert overload, so we
> should prepare IOVA for DMA copy, and VA for memory copy.

yes, Ferruh helped me to understand. Please let me look into this and 
share a v3 soon.

>
> I prefer keep pkt_mbuf, but new add two field, and create this two field when setup_memory_env(),
> then direct use them in do_xxx_mem_copy。

Please help me understand if you are suggesting, in function 
`setup_memory_env` we still keep pkt_mbuf creation.

But when the arrays are created instead of populating them with mbuf, we 
directly call `pktmbuf_mtod` and store the

starting address. Thus in cpu-copy or dma-copy we do not spend time in
compute. Is this what you mean?


My reasoning for not using pktmbuf is as follows

1. pkt_mbuf has rte_mbuf metadata + private + headroom + tailroom

2. so when we create payloads for 2K, 4K, 8K, 16K, 32K, 1GB we are
accounting for extra headroom, which is not efficient

3. dma-perf is targeted for performance and not network function.

4. there is an existing example which makes use of pktmbuf and dma calls.


hence I would like to use mempool which also helps per numa with flags.


>
> Thanks.
>
>> +                                     buf_size,
>> +                                     0);
>>                        if (unlikely(ret < 0)) {
>>                                if (ret == -ENOSPC) {
>>                                        do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
>> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>>        volatile struct worker_info *worker_info = &(para->worker_info);
>>        const uint32_t nr_buf = para->nr_buf;
>>        const uint32_t buf_size = para->buf_size;
>> -     struct rte_mbuf **srcs = para->srcs;
>> -     struct rte_mbuf **dsts = para->dsts;
>> +     void **srcs = para->srcs;
>> +     void **dsts = para->dsts;
>>        uint32_t i;
>>
>>        worker_info->stop_flag = false;
>> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>>
>>        while (1) {
>>                for (i = 0; i < nr_buf; i++) {
>> -                     const void *src = rte_pktmbuf_mtod(dsts[i], void *);
>> -                     void *dst = rte_pktmbuf_mtod(srcs[i], void *);
>> +                     const void *src = (void *) dsts[i];
>> +                     void *dst = (void *) srcs[i];
>>
>>                        /* copy buffer form src to dst */
>>                        rte_memcpy(dst, src, (size_t)buf_size);
>> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>>   }
>>
>>   static int
>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>> -                     struct rte_mbuf ***dsts)
>> +setup_memory_env(struct test_configure *cfg, void ***srcs,
>> +                     void ***dsts)
>>   {
>>        unsigned int buf_size = cfg->buf_size.cur;
>>        unsigned int nr_sockets;
>> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>                return -1;
>>        }
>>
>> -     src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
>> +     src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>>                        nr_buf,
>> +                     buf_size,
>>                        0,
>>                        0,
>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>> -                     cfg->src_numa_node);
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     cfg->src_numa_node,
>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>        if (src_pool == NULL) {
>>                PRINT_ERR("Error with source mempool creation.\n");
>>                return -1;
>>        }
>>
>> -     dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
>> +     dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>>                        nr_buf,
>> +                     buf_size,
>>                        0,
>>                        0,
>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>> -                     cfg->dst_numa_node);
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     cfg->dst_numa_node,
>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>        if (dst_pool == NULL) {
>>                PRINT_ERR("Error with destination mempool creation.\n");
>>                return -1;
>>        }
>>
>> -     *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>> +     *srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>>        if (*srcs == NULL) {
>>                printf("Error: srcs malloc failed.\n");
>>                return -1;
>>        }
>>
>> -     *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>> +     *dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>>        if (*dsts == NULL) {
>>                printf("Error: dsts malloc failed.\n");
>>                return -1;
>>        }
>>
>> -     if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
>> -             printf("alloc src mbufs failed.\n");
>> +     if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
>> +             printf("alloc src bufs failed.\n");
>>                return -1;
>>        }
>>
>> -     if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
>> -             printf("alloc dst mbufs failed.\n");
>> +     if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
>> +             printf("alloc dst bufs failed.\n");
>>                return -1;
>>        }
>>
>> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>        uint16_t i;
>>        uint32_t offset;
>>        unsigned int lcore_id = 0;
>> -     struct rte_mbuf **srcs = NULL, **dsts = NULL;
>> +     void **srcs = NULL, **dsts = NULL;
>>        struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>        unsigned int buf_size = cfg->buf_size.cur;
>>        uint16_t kick_batch = cfg->kick_batch.cur;
>> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>   out:
>>        /* free mbufs used in the test */
>>        if (srcs != NULL)
>> -             rte_pktmbuf_free_bulk(srcs, nr_buf);
>> +             rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>>        if (dsts != NULL)
>> -             rte_pktmbuf_free_bulk(dsts, nr_buf);
>> +             rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>>
>>        /* free the points for the mbufs */
>>        rte_free(srcs);
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] app/dma-perf: replace pktmbuf with mempool objects
  2024-02-27  9:57     ` Varghese, Vipin
@ 2024-02-27 12:27       ` fengchengwen
  2024-02-28  3:08         ` Varghese, Vipin
  0 siblings, 1 reply; 16+ messages in thread
From: fengchengwen @ 2024-02-27 12:27 UTC (permalink / raw)
  To: Varghese, Vipin, dev, david.marchand, honest.jiang
  Cc: Morten Brørup, Thiyagrajan P, Ferruh Yigit

Hi Vipin,

On 2024/2/27 17:57, Varghese, Vipin wrote:
> 
> On 2/26/2024 7:35 AM, fengchengwen wrote:
>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>
>>
>> Hi Vipin,
>>
>> On 2023/12/20 19:03, Vipin Varghese wrote:
>>> From: Vipin Varghese <Vipin.Varghese@amd.com>
>>>
>>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>>> especially in lower buffer size. Using Mempool, allows to reduce
>>> the extra CPU cycles.
>>>
>>> Changes made are
>>> 1. pktmbuf pool create with mempool create.
>>> 2. create src & dst pointer array from the appropaite numa.
>>> 3. use get pool and put for mempool objects.
>>> 4. remove pktmbuf_mtod for dma and cpu memcpy.
>>>
>>> v2 changes:
>>>   - add ACK from  Morten Brørup
>>>
>>> v1 changes:
>>>   - pktmbuf pool create with mempool create.
>>>   - create src & dst pointer array from the appropaite numa.
>>>   - use get pool and put for mempool objects.
>>>   - remove pktmbuf_mtod for dma and cpu memcpy.
>>>
>>> Test Results for pktmbuf vs mempool:
>>> ====================================
>>>
>>> Format: Buffer Size | % AVG cycles | % AVG Gbps
>>>
>>> Category-1: HW-DSA
>>> -------------------
>>>    64|-13.11| 14.97
>>>   128|-41.49|  0.41
>>>   256| -1.85|  1.20
>>>   512| -9.38|  8.81
>>> 1024|  1.82| -2.00
>>> 1518|  0.00| -0.80
>>> 2048|  1.03| -0.91
>>> 4096|  0.00| -0.35
>>> 8192|  0.07| -0.08
>>>
>>> Category-2: MEMCPY
>>> -------------------
>>>    64|-12.50|14.14
>>>   128|-40.63|67.26
>>>   256|-38.78|59.35
>>>   512|-30.26|43.36
>>> 1024|-21.80|27.04
>>> 1518|-16.23|19.33
>>> 2048|-14.75|16.81
>>> 4096| -9.56|10.01
>>> 8192| -3.32| 3.12
>>>
>>> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>>> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
>>> ---
>>> ---
>>>   app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>>>   1 file changed, 44 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
>>> index 9b1f58c78c..dc6f16cc01 100644
>>> --- a/app/test-dma-perf/benchmark.c
>>> +++ b/app/test-dma-perf/benchmark.c
>>> @@ -43,8 +43,8 @@ struct lcore_params {
>>>        uint16_t kick_batch;
>>>        uint32_t buf_size;
>>>        uint16_t test_secs;
>>> -     struct rte_mbuf **srcs;
>>> -     struct rte_mbuf **dsts;
>>> +     void **srcs;
>>> +     void **dsts;
>>>        volatile struct worker_info worker_info;
>>>   };
>>>
>>> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>>>   }
>>>
>>>   static inline void
>>> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
>>> +cache_flush_buf(__rte_unused void **array,
>>>                __rte_unused uint32_t buf_size,
>>>                __rte_unused uint32_t nr_buf)
>>>   {
>>>   #ifdef RTE_ARCH_X86_64
>>>        char *data;
>>> -     struct rte_mbuf **srcs = array;
>>> +     void **srcs = array;
>>>        uint32_t i, offset;
>>>
>>>        for (i = 0; i < nr_buf; i++) {
>>> -             data = rte_pktmbuf_mtod(srcs[i], char *);
>>> +             data = (char *) srcs[i];
>>>                for (offset = 0; offset < buf_size; offset += 64)
>>>                        __builtin_ia32_clflush(data + offset);
>>>        }
>>> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>>>        const uint32_t nr_buf = para->nr_buf;
>>>        const uint16_t kick_batch = para->kick_batch;
>>>        const uint32_t buf_size = para->buf_size;
>>> -     struct rte_mbuf **srcs = para->srcs;
>>> -     struct rte_mbuf **dsts = para->dsts;
>>> +     void **srcs = para->srcs;
>>> +     void **dsts = para->dsts;
>>>        uint16_t nr_cpl;
>>>        uint64_t async_cnt = 0;
>>>        uint32_t i;
>>> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>>>        while (1) {
>>>                for (i = 0; i < nr_buf; i++) {
>>>   dma_copy:
>>> -                     ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
>>> -                             rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>>> +                     ret = rte_dma_copy(dev_id,
>>> +                                     0,
>>> +                                     (rte_iova_t) srcs[i],
>>> +                                     (rte_iova_t) dsts[i],
> Thank you ChengWen for the suggestion, please find my observations below
>> should consider IOVA != VA, so here should be with rte_mempool_virt2iova(),
>> but this commit is mainly to eliminate the address convert overload, so we
>> should prepare IOVA for DMA copy, and VA for memory copy.
> 
> yes, Ferruh helped me to understand. Please let me look into this and share a v3 soon.
> 
>>
>> I prefer keep pkt_mbuf, but new add two field, and create this two field when setup_memory_env(),
>> then direct use them in do_xxx_mem_copy。
> 
> Please help me understand if you are suggesting, in function `setup_memory_env` we still keep pkt_mbuf creation.
> 
> But when the arrays are created instead of populating them with mbuf, we directly call `pktmbuf_mtod` and store the
> 
> starting address. Thus in cpu-copy or dma-copy we do not spend time on computation. Is this what you mean?

Yes

> 
> 
> My reasoning for not using pktmbuf is as follows
> 
> 1. pkt_mbuf has rte_mbuf metadata + private + headroom + tailroom
> 
> 2. so when we create payloads for 2K, 4K, 8K, 16K, 32K, 1GB we are accounting for extra headroom, which is not efficient
> 
> 3. dma-perf is targeted for performance and not network function.
> 
> 4. there is an existing example which makes use pktmbuf and dma calls.
> 
> 
> hence I would like to use mempool which also helps per numa with flags.

What I understand is that the low performance is mainly because the CPU can't keep up with the DMA
device's performance, so the CPU is the bottleneck; when we reduce the tasks of the CPU (just like
this commit did), the performance is improved.

This commit can test the maximum performance when the CPU and DMA cowork together, so I think we can
add this commit.

pktmbuf is a popular programming entity, and almost all application (including examples) in the DPDK
community are based on pktmbuf.

I think that keeping the use of pktmbuf provides flexibility: someone who wants to do more operations
with pktmbuf (maybe emulating the real logic) could easily modify and test it.

Thanks

> 
> 
>>
>> Thanks.
>>
>>> +                                     buf_size,
>>> +                                     0);
>>>                        if (unlikely(ret < 0)) {
>>>                                if (ret == -ENOSPC) {
>>>                                        do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
>>> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>>>        volatile struct worker_info *worker_info = &(para->worker_info);
>>>        const uint32_t nr_buf = para->nr_buf;
>>>        const uint32_t buf_size = para->buf_size;
>>> -     struct rte_mbuf **srcs = para->srcs;
>>> -     struct rte_mbuf **dsts = para->dsts;
>>> +     void **srcs = para->srcs;
>>> +     void **dsts = para->dsts;
>>>        uint32_t i;
>>>
>>>        worker_info->stop_flag = false;
>>> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>>>
>>>        while (1) {
>>>                for (i = 0; i < nr_buf; i++) {
>>> -                     const void *src = rte_pktmbuf_mtod(dsts[i], void *);
>>> -                     void *dst = rte_pktmbuf_mtod(srcs[i], void *);
>>> +                     const void *src = (void *) dsts[i];
>>> +                     void *dst = (void *) srcs[i];
>>>
>>>                        /* copy buffer form src to dst */
>>>                        rte_memcpy(dst, src, (size_t)buf_size);
>>> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>>>   }
>>>
>>>   static int
>>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>> -                     struct rte_mbuf ***dsts)
>>> +setup_memory_env(struct test_configure *cfg, void ***srcs,
>>> +                     void ***dsts)
>>>   {
>>>        unsigned int buf_size = cfg->buf_size.cur;
>>>        unsigned int nr_sockets;
>>> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>>                return -1;
>>>        }
>>>
>>> -     src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
>>> +     src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>>>                        nr_buf,
>>> +                     buf_size,
>>>                        0,
>>>                        0,
>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>> -                     cfg->src_numa_node);
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     cfg->src_numa_node,
>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>        if (src_pool == NULL) {
>>>                PRINT_ERR("Error with source mempool creation.\n");
>>>                return -1;
>>>        }
>>>
>>> -     dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
>>> +     dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>>>                        nr_buf,
>>> +                     buf_size,
>>>                        0,
>>>                        0,
>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>> -                     cfg->dst_numa_node);
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     cfg->dst_numa_node,
>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>        if (dst_pool == NULL) {
>>>                PRINT_ERR("Error with destination mempool creation.\n");
>>>                return -1;
>>>        }
>>>
>>> -     *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>> +     *srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>>>        if (*srcs == NULL) {
>>>                printf("Error: srcs malloc failed.\n");
>>>                return -1;
>>>        }
>>>
>>> -     *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>> +     *dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>>>        if (*dsts == NULL) {
>>>                printf("Error: dsts malloc failed.\n");
>>>                return -1;
>>>        }
>>>
>>> -     if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
>>> -             printf("alloc src mbufs failed.\n");
>>> +     if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
>>> +             printf("alloc src bufs failed.\n");
>>>                return -1;
>>>        }
>>>
>>> -     if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>> -             printf("alloc dst mbufs failed.\n");
>>> +     if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>> +             printf("alloc dst bufs failed.\n");
>>>                return -1;
>>>        }
>>>
>>> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>        uint16_t i;
>>>        uint32_t offset;
>>>        unsigned int lcore_id = 0;
>>> -     struct rte_mbuf **srcs = NULL, **dsts = NULL;
>>> +     void **srcs = NULL, **dsts = NULL;
>>>        struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>>        unsigned int buf_size = cfg->buf_size.cur;
>>>        uint16_t kick_batch = cfg->kick_batch.cur;
>>> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>   out:
>>>        /* free mbufs used in the test */
>>>        if (srcs != NULL)
>>> -             rte_pktmbuf_free_bulk(srcs, nr_buf);
>>> +             rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>>>        if (dsts != NULL)
>>> -             rte_pktmbuf_free_bulk(dsts, nr_buf);
>>> +             rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>>>
>>>        /* free the points for the mbufs */
>>>        rte_free(srcs);
>>>
> .

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] app/dma-perf: replace pktmbuf with mempool objects
  2024-02-27 12:27       ` fengchengwen
@ 2024-02-28  3:08         ` Varghese, Vipin
  0 siblings, 0 replies; 16+ messages in thread
From: Varghese, Vipin @ 2024-02-28  3:08 UTC (permalink / raw)
  To: fengchengwen, dev, david.marchand, honest.jiang
  Cc: Morten Brørup, Thiyagrajan P, Ferruh Yigit


On 2/27/2024 5:57 PM, fengchengwen wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Hi Vipin,
>
> On 2024/2/27 17:57, Varghese, Vipin wrote:
>> On 2/26/2024 7:35 AM, fengchengwen wrote:
>>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>>
>>>
>>> Hi Vipin,
>>>
>>> On 2023/12/20 19:03, Vipin Varghese wrote:
>>>> From: Vipin Varghese <Vipin.Varghese@amd.com>
>>>>
>>>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>>>> especially in lower buffer size. Using Mempool, allows to reduce
>>>> the extra CPU cycles.
>>>>
>>>> Changes made are
>>>> 1. pktmbuf pool create with mempool create.
>>>> 2. create src & dst pointer array from the appropaite numa.
>>>> 3. use get pool and put for mempool objects.
>>>> 4. remove pktmbuf_mtod for dma and cpu memcpy.
>>>>
>>>> v2 changes:
>>>>    - add ACK from  Morten Brørup
>>>>
>>>> v1 changes:
>>>>    - pktmbuf pool create with mempool create.
>>>>    - create src & dst pointer array from the appropaite numa.
>>>>    - use get pool and put for mempool objects.
>>>>    - remove pktmbuf_mtod for dma and cpu memcpy.
>>>>
>>>> Test Results for pktmbuf vs mempool:
>>>> ====================================
>>>>
>>>> Format: Buffer Size | % AVG cycles | % AVG Gbps
>>>>
>>>> Category-1: HW-DSA
>>>> -------------------
>>>>     64|-13.11| 14.97
>>>>    128|-41.49|  0.41
>>>>    256| -1.85|  1.20
>>>>    512| -9.38|  8.81
>>>> 1024|  1.82| -2.00
>>>> 1518|  0.00| -0.80
>>>> 2048|  1.03| -0.91
>>>> 4096|  0.00| -0.35
>>>> 8192|  0.07| -0.08
>>>>
>>>> Category-2: MEMCPY
>>>> -------------------
>>>>     64|-12.50|14.14
>>>>    128|-40.63|67.26
>>>>    256|-38.78|59.35
>>>>    512|-30.26|43.36
>>>> 1024|-21.80|27.04
>>>> 1518|-16.23|19.33
>>>> 2048|-14.75|16.81
>>>> 4096| -9.56|10.01
>>>> 8192| -3.32| 3.12
>>>>
>>>> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>>>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>>>> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
>>>> ---
>>>> ---
>>>>    app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>>>>    1 file changed, 44 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
>>>> index 9b1f58c78c..dc6f16cc01 100644
>>>> --- a/app/test-dma-perf/benchmark.c
>>>> +++ b/app/test-dma-perf/benchmark.c
>>>> @@ -43,8 +43,8 @@ struct lcore_params {
>>>>         uint16_t kick_batch;
>>>>         uint32_t buf_size;
>>>>         uint16_t test_secs;
>>>> -     struct rte_mbuf **srcs;
>>>> -     struct rte_mbuf **dsts;
>>>> +     void **srcs;
>>>> +     void **dsts;
>>>>         volatile struct worker_info worker_info;
>>>>    };
>>>>
>>>> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>>>>    }
>>>>
>>>>    static inline void
>>>> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
>>>> +cache_flush_buf(__rte_unused void **array,
>>>>                 __rte_unused uint32_t buf_size,
>>>>                 __rte_unused uint32_t nr_buf)
>>>>    {
>>>>    #ifdef RTE_ARCH_X86_64
>>>>         char *data;
>>>> -     struct rte_mbuf **srcs = array;
>>>> +     void **srcs = array;
>>>>         uint32_t i, offset;
>>>>
>>>>         for (i = 0; i < nr_buf; i++) {
>>>> -             data = rte_pktmbuf_mtod(srcs[i], char *);
>>>> +             data = (char *) srcs[i];
>>>>                 for (offset = 0; offset < buf_size; offset += 64)
>>>>                         __builtin_ia32_clflush(data + offset);
>>>>         }
>>>> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>>>>         const uint32_t nr_buf = para->nr_buf;
>>>>         const uint16_t kick_batch = para->kick_batch;
>>>>         const uint32_t buf_size = para->buf_size;
>>>> -     struct rte_mbuf **srcs = para->srcs;
>>>> -     struct rte_mbuf **dsts = para->dsts;
>>>> +     void **srcs = para->srcs;
>>>> +     void **dsts = para->dsts;
>>>>         uint16_t nr_cpl;
>>>>         uint64_t async_cnt = 0;
>>>>         uint32_t i;
>>>> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>>>>         while (1) {
>>>>                 for (i = 0; i < nr_buf; i++) {
>>>>    dma_copy:
>>>> -                     ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
>>>> -                             rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>>>> +                     ret = rte_dma_copy(dev_id,
>>>> +                                     0,
>>>> +                                     (rte_iova_t) srcs[i],
>>>> +                                     (rte_iova_t) dsts[i],
>> Thank you ChengWen for the suggestion, please find my observations below
>>> should consider IOVA != VA, so here should be with rte_mempool_virt2iova(),
>>> but this commit is mainly to eliminate the address convert overload, so we
>>> should prepare IOVA for DMA copy, and VA for memory copy.
>> yes, Ferruh helped me to understand. Please let me look into this and share a v3 soon.
>>
>>> I prefer keep pkt_mbuf, but new add two field, and create this two field when setup_memory_env(),
>>> then direct use them in do_xxx_mem_copy。
>> Please help me understand if you are suggesting, in function `setup_memory_env` we still keep pkt_mbuf creation.
>>
>> But when the arrays are created instead of populating them with mbuf, we directly call `pktmbuf_mtod` and store the
>>
>> starting address. Thus in cpu-copy or dma-copy we do not spend time on computation. Is this what you mean?
> Yes
>
>>
>> My reasoning for not using pktmbuf is as follows
>>
>> 1. pkt_mbuf has rte_mbuf metadata + private + headroom + tailroom
>>
>> 2. so when we create payloads for 2K, 4K, 8K, 16K, 32K, 1GB we are accounting for extra headroom, which is not efficient
>>
>> 3. dma-perf is targeted for performance and not network function.
>>
>> 4. there is an existing example which makes use pktmbuf and dma calls.
>>
>>
>> hence I would like to use mempool which also helps per numa with flags.
> What I understand is that the low performance is mainly because the CPU can't keep up with the DMA
> device's performance, so the CPU is the bottleneck; when we reduce the tasks of the CPU (just like
> this commit did), the performance is improved.
>
> This commit can test the maximum performance when the CPU and DMA cowork together, so I think we can
> add this commit.
>
> pktmbuf is a popular programming entity, and almost all application (including examples) in the DPDK
> community are based on pktmbuf.
>
> I think that keeping the use of pktmbuf provides flexibility: someone who wants to do more operations
> with pktmbuf (maybe emulating the real logic) could easily modify and test it.

Thank you Chengwen for the comments. I have also noticed some merges on
`benchmark.c`. Let me take that as the new baseline and submit a v3 that
adds mempool as a new option.

>
> Thanks
>
>>
>>> Thanks.
>>>
>>>> +                                     buf_size,
>>>> +                                     0);
>>>>                         if (unlikely(ret < 0)) {
>>>>                                 if (ret == -ENOSPC) {
>>>>                                         do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
>>>> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>>>>         volatile struct worker_info *worker_info = &(para->worker_info);
>>>>         const uint32_t nr_buf = para->nr_buf;
>>>>         const uint32_t buf_size = para->buf_size;
>>>> -     struct rte_mbuf **srcs = para->srcs;
>>>> -     struct rte_mbuf **dsts = para->dsts;
>>>> +     void **srcs = para->srcs;
>>>> +     void **dsts = para->dsts;
>>>>         uint32_t i;
>>>>
>>>>         worker_info->stop_flag = false;
>>>> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>>>>
>>>>         while (1) {
>>>>                 for (i = 0; i < nr_buf; i++) {
>>>> -                     const void *src = rte_pktmbuf_mtod(dsts[i], void *);
>>>> -                     void *dst = rte_pktmbuf_mtod(srcs[i], void *);
>>>> +                     const void *src = (void *) dsts[i];
>>>> +                     void *dst = (void *) srcs[i];
>>>>
>>>>                         /* copy buffer form src to dst */
>>>>                         rte_memcpy(dst, src, (size_t)buf_size);
>>>> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>>>>    }
>>>>
>>>>    static int
>>>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>>> -                     struct rte_mbuf ***dsts)
>>>> +setup_memory_env(struct test_configure *cfg, void ***srcs,
>>>> +                     void ***dsts)
>>>>    {
>>>>         unsigned int buf_size = cfg->buf_size.cur;
>>>>         unsigned int nr_sockets;
>>>> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
>>>> +     src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>>>>                         nr_buf,
>>>> +                     buf_size,
>>>>                         0,
>>>>                         0,
>>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>>> -                     cfg->src_numa_node);
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     cfg->src_numa_node,
>>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>>         if (src_pool == NULL) {
>>>>                 PRINT_ERR("Error with source mempool creation.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
>>>> +     dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>>>>                         nr_buf,
>>>> +                     buf_size,
>>>>                         0,
>>>>                         0,
>>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>>> -                     cfg->dst_numa_node);
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     cfg->dst_numa_node,
>>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>>         if (dst_pool == NULL) {
>>>>                 PRINT_ERR("Error with destination mempool creation.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>>> +     *srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>>>>         if (*srcs == NULL) {
>>>>                 printf("Error: srcs malloc failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>>> +     *dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>>>>         if (*dsts == NULL) {
>>>>                 printf("Error: dsts malloc failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
>>>> -             printf("alloc src mbufs failed.\n");
>>>> +     if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
>>>> +             printf("alloc src bufs failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>>> -             printf("alloc dst mbufs failed.\n");
>>>> +     if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>>> +             printf("alloc dst bufs failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>>         uint16_t i;
>>>>         uint32_t offset;
>>>>         unsigned int lcore_id = 0;
>>>> -     struct rte_mbuf **srcs = NULL, **dsts = NULL;
>>>> +     void **srcs = NULL, **dsts = NULL;
>>>>         struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>>>         unsigned int buf_size = cfg->buf_size.cur;
>>>>         uint16_t kick_batch = cfg->kick_batch.cur;
>>>> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>>    out:
>>>>         /* free mbufs used in the test */
>>>>         if (srcs != NULL)
>>>> -             rte_pktmbuf_free_bulk(srcs, nr_buf);
>>>> +             rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>>>>         if (dsts != NULL)
>>>> -             rte_pktmbuf_free_bulk(dsts, nr_buf);
>>>> +             rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>>>>
>>>>         /* free the points for the mbufs */
>>>>         rte_free(srcs);
>>>>
>> .

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] app/dma-perf: replace pktmbuf with mempool objects
@ 2023-12-19 16:35 Vipin Varghese
  0 siblings, 0 replies; 16+ messages in thread
From: Vipin Varghese @ 2023-12-19 16:35 UTC (permalink / raw)
  To: dev, stable, honest.jiang, gmuthukrishn, ferruh.yigit

Replace pktmbuf pool with mempool, this allows increase in MOPS
especially in lower buffer size. Using Mempool, allows to reduce
the extra CPU cycles.

Changes made are
1. pktmbuf pool create with mempool create.
2. create src & dst pointer array from the appropriate numa.
3. use get pool and put for mempool objects.
4. remove pktmbuf_mtod for dma and cpu memcpy.

Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
---
 app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 30 deletions(-)

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..dc6f16cc01 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -43,8 +43,8 @@ struct lcore_params {
 	uint16_t kick_batch;
 	uint32_t buf_size;
 	uint16_t test_secs;
-	struct rte_mbuf **srcs;
-	struct rte_mbuf **dsts;
+	void **srcs;
+	void **dsts;
 	volatile struct worker_info worker_info;
 };
 
@@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
 }
 
 static inline void
-cache_flush_buf(__rte_unused struct rte_mbuf **array,
+cache_flush_buf(__rte_unused void **array,
 		__rte_unused uint32_t buf_size,
 		__rte_unused uint32_t nr_buf)
 {
 #ifdef RTE_ARCH_X86_64
 	char *data;
-	struct rte_mbuf **srcs = array;
+	void **srcs = array;
 	uint32_t i, offset;
 
 	for (i = 0; i < nr_buf; i++) {
-		data = rte_pktmbuf_mtod(srcs[i], char *);
+		data = (char *) srcs[i];
 		for (offset = 0; offset < buf_size; offset += 64)
 			__builtin_ia32_clflush(data + offset);
 	}
@@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
 	const uint32_t nr_buf = para->nr_buf;
 	const uint16_t kick_batch = para->kick_batch;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint16_t nr_cpl;
 	uint64_t async_cnt = 0;
 	uint32_t i;
@@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
 dma_copy:
-			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
-				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			ret = rte_dma_copy(dev_id,
+					0,
+					(rte_iova_t) srcs[i],
+					(rte_iova_t) dsts[i],
+					buf_size,
+					0);
 			if (unlikely(ret < 0)) {
 				if (ret == -ENOSPC) {
 					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
@@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
 	volatile struct worker_info *worker_info = &(para->worker_info);
 	const uint32_t nr_buf = para->nr_buf;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint32_t i;
 
 	worker_info->stop_flag = false;
@@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
 
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
-			const void *src = rte_pktmbuf_mtod(dsts[i], void *);
-			void *dst = rte_pktmbuf_mtod(srcs[i], void *);
+			const void *src = (void *) dsts[i];
+			void *dst = (void *) srcs[i];
 
 			/* copy buffer form src to dst */
 			rte_memcpy(dst, src, (size_t)buf_size);
@@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg, void ***srcs,
+			void ***dsts)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
@@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
-	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+	src_pool = rte_mempool_create("Benchmark_DMA_SRC",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->src_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->src_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
 		return -1;
 	}
 
-	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+	dst_pool = rte_mempool_create("Benchmark_DMA_DST",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->dst_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->dst_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
 		return -1;
 	}
 
-	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
 	if (*srcs == NULL) {
 		printf("Error: srcs malloc failed.\n");
 		return -1;
 	}
 
-	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
 	if (*dsts == NULL) {
 		printf("Error: dsts malloc failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
-		printf("alloc src mbufs failed.\n");
+	if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src bufs failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
-		printf("alloc dst mbufs failed.\n");
+	if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst bufs failed.\n");
 		return -1;
 	}
 
@@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint16_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	void **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 out:
 	/* free mbufs used in the test */
 	if (srcs != NULL)
-		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_mempool_put_bulk(src_pool, srcs, nr_buf);
 	if (dsts != NULL)
-		rte_pktmbuf_free_bulk(dsts, nr_buf);
+		rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
 
 	/* free the points for the mbufs */
 	rte_free(srcs);
-- 
2.34.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2024-02-28  3:08 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-12 10:37 [PATCH] app/dma-perf: replace pktmbuf with mempool objects Vipin Varghese
2023-12-12 11:40 ` Morten Brørup
2023-12-12 14:38   ` Ferruh Yigit
2023-12-12 15:16     ` Morten Brørup
2023-12-12 15:37       ` Bruce Richardson
2023-12-12 17:13         ` Varghese, Vipin
2023-12-12 18:09           ` Morten Brørup
2023-12-12 18:13             ` Varghese, Vipin
2023-12-20  9:17               ` Varghese, Vipin
2023-12-20  9:21                 ` David Marchand
2023-12-20 11:03 ` [PATCH v2] " Vipin Varghese
2024-02-26  2:05   ` fengchengwen
2024-02-27  9:57     ` Varghese, Vipin
2024-02-27 12:27       ` fengchengwen
2024-02-28  3:08         ` Varghese, Vipin
2023-12-19 16:35 [PATCH] " Vipin Varghese

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).