DPDK patches and discussions
* [PATCH] app/testpmd: improve sse based macswap
@ 2024-07-13 15:19 Vipin Varghese
  2024-07-15 15:07 ` Bruce Richardson
  0 siblings, 1 reply; 7+ messages in thread
From: Vipin Varghese @ 2024-07-13 15:19 UTC (permalink / raw)
  To: dev, ferruh.yigit, bruce.richardson, konstantin.v.ananyev,
	vipin.varghese

The goal of this patch is to improve the SSE macswap on x86_64 by
reducing stalls in the CPU backend. The original SSE macswap
implementation performs the loads, shuffles and stores as separate
groups within the loop. By interleaving the SIMD instructions we can
reduce the stalls caused by
 - SSE load token exhaustion
 - the dependency of each shuffle on its preceding load

Other changes which improve packets per second are
 - interleaving the MBUF offload-flag writes (a separate cacheline),
 - using the register keyword

Test results:
------------
Platform: AMD EPYC SIENA 8594P @2.3GHz, no boost
DPDK: 24.03

------------------------------------------------
TEST IO 64B: baseline <NIC : MPPs>
 - mellanox CX-7 2*200Gbps : 42.0
 - intel E810 1*100Gbps : 82.0
 - intel E810 2*200Gbps (2CQ-DA2): 83.0
------------------------------------------------
TEST MACSWAP 64B: <NIC : Before : After>
 - mellanox CX-7 2*200Gbps : 31.533 : 31.90
 - intel E810 1*100Gbps : 50.380 : 47.0
 - intel E810 2*200Gbps (2CQ-DA2): 48.840 : 49.827
------------------------------------------------
TEST MACSWAP 128B: <NIC : Before: After>
 - mellanox CX-7 2*200Gbps: 30.946 : 31.770
 - intel E810 1*100Gbps: 49.386 : 46.366
 - intel E810 2*200Gbps (2CQ-DA2): 47.979 : 49.503
------------------------------------------------
TEST MACSWAP 256B: <NIC: Before: After>
 - mellanox CX-7 2*200Gbps: 32.480 : 33.150
 - intel E810 1 * 100Gbps: 45.29 : 44.571
 - intel E810 2 * 200Gbps (2CQ-DA2): 45.033 : 45.117
------------------------------------------------

Using multiple queues and lcores, there is a linear increase in MPPs.

Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
---
 app/test-pmd/macswap_sse.h | 40 ++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index 223f87a539..a3d3a274e5 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -11,21 +11,21 @@ static inline void
 do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		struct rte_port *txp)
 {
-	struct rte_ether_hdr *eth_hdr[4];
-	struct rte_mbuf *mb[4];
+	register struct rte_ether_hdr *eth_hdr[8];
+	register struct rte_mbuf *mb[8];
 	uint64_t ol_flags;
 	int i;
 	int r;
-	__m128i addr0, addr1, addr2, addr3;
+	register __m128i addr0, addr1, addr2, addr3;
 	/**
 	 * shuffle mask be used to shuffle the 16 bytes.
 	 * byte 0-5 wills be swapped with byte 6-11.
 	 * byte 12-15 will keep unchanged.
 	 */
-	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
-					5, 4, 3, 2,
-					1, 0, 11, 10,
-					9, 8, 7, 6);
+	register const __m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+							5, 4, 3, 2,
+							1, 0, 11, 10,
+							9, 8, 7, 6);
 
 	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
 	vlan_qinq_set(pkts, nb, ol_flags,
@@ -44,23 +44,24 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 
 		mb[0] = pkts[i++];
 		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);
-		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
-
 		mb[1] = pkts[i++];
 		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct rte_ether_hdr *);
-		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
-
-
 		mb[2] = pkts[i++];
 		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct rte_ether_hdr *);
-		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
-
 		mb[3] = pkts[i++];
 		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct rte_ether_hdr *);
-		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
 
+		/* Interleave load, shuffle & set */
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+		mbuf_field_set(mb[0], ol_flags);
+		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+		mbuf_field_set(mb[1], ol_flags);
 		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+		mbuf_field_set(mb[2], ol_flags);
 		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
+		mbuf_field_set(mb[3], ol_flags);
 		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
 		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
 
@@ -69,25 +70,22 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
 		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
 
-		mbuf_field_set(mb[0], ol_flags);
-		mbuf_field_set(mb[1], ol_flags);
-		mbuf_field_set(mb[2], ol_flags);
-		mbuf_field_set(mb[3], ol_flags);
 		r -= 4;
 	}
 
 	for ( ; i < nb; i++) {
 		if (i < nb - 1)
 			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+
 		mb[0] = pkts[i];
 		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
 		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+		/* MBUF and Ethernet are 2 separate cacheline */
+		mbuf_field_set(mb[0], ol_flags);
 		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
 		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
-
-		mbuf_field_set(mb[0], ol_flags);
 	}
 }
 
-- 
2.34.1



* Re: [PATCH] app/testpmd: improve sse based macswap
  2024-07-13 15:19 [PATCH] app/testpmd: improve sse based macswap Vipin Varghese
@ 2024-07-15 15:07 ` Bruce Richardson
  0 siblings, 0 replies; 7+ messages in thread
From: Bruce Richardson @ 2024-07-15 15:07 UTC (permalink / raw)
  To: Vipin Varghese; +Cc: dev, ferruh.yigit, konstantin.v.ananyev

On Sat, Jul 13, 2024 at 08:49:49PM +0530, Vipin Varghese wrote:
> The goal of this patch is to improve the SSE macswap on x86_64 by
> reducing stalls in the CPU backend. The original SSE macswap
> implementation performs the loads, shuffles and stores as separate
> groups within the loop. By interleaving the SIMD instructions we can
> reduce the stalls caused by
>  - SSE load token exhaustion
>  - the dependency of each shuffle on its preceding load
> 
> Other changes which improve packets per second are
>  - interleaving the MBUF offload-flag writes (a separate cacheline),
>  - using the register keyword
> 
> Test results:
> ------------
> Platform: AMD EPYC SIENA 8594P @2.3GHz, no boost
> DPDK: 24.03
> 
> ------------------------------------------------
> TEST IO 64B: baseline <NIC : MPPs>
>  - mellanox CX-7 2*200Gbps : 42.0
>  - intel E810 1*100Gbps : 82.0
>  - intel E810 2*200Gbps (2CQ-DA2): 83.0
> ------------------------------------------------
> TEST MACSWAP 64B: <NIC : Before : After>
>  - mellanox CX-7 2*200Gbps : 31.533 : 31.90
>  - intel E810 1*100Gbps : 50.380 : 47.0
>  - intel E810 2*200Gbps (2CQ-DA2): 48.840 : 49.827
> ------------------------------------------------
> TEST MACSWAP 128B: <NIC : Before: After>
>  - mellanox CX-7 2*200Gbps: 30.946 : 31.770
>  - intel E810 1*100Gbps: 49.386 : 46.366
>  - intel E810 2*200Gbps (2CQ-DA2): 47.979 : 49.503
> ------------------------------------------------
> TEST MACSWAP 256B: <NIC: Before: After>
>  - mellanox CX-7 2*200Gbps: 32.480 : 33.150
>  - intel E810 1 * 100Gbps: 45.29 : 44.571
>  - intel E810 2 * 200Gbps (2CQ-DA2): 45.033 : 45.117
> ------------------------------------------------
> 

Hi,

Interesting patch. Do you know why we see regressions in some of the cases
above? For 1x100G at 64B and 128B packet sizes we see perf drops of 3mpps
vs smaller gains in the other two cases at each size (much smaller in the
64B case).

Couple of other questions inline below too.

Thanks,
/Bruce

> Using multiple queues and lcores, there is a linear increase in MPPs.
> 
> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
> ---
>  app/test-pmd/macswap_sse.h | 40 ++++++++++++++++++--------------------
>  1 file changed, 19 insertions(+), 21 deletions(-)
> 
> diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
> index 223f87a539..a3d3a274e5 100644
> --- a/app/test-pmd/macswap_sse.h
> +++ b/app/test-pmd/macswap_sse.h
> @@ -11,21 +11,21 @@ static inline void
>  do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
>  		struct rte_port *txp)
>  {
> -	struct rte_ether_hdr *eth_hdr[4];
> -	struct rte_mbuf *mb[4];
> +	register struct rte_ether_hdr *eth_hdr[8];
> +	register struct rte_mbuf *mb[8];

Does using "register" actually make a difference to the generated code?

Also, why increase the array sizes from 4 to 8? The actual code only
uses 4 elements of each array below anyway. Is it for cache alignment
purposes perhaps? If so, please use explicit cache alignment attributes to
specify this rather than having it implicit in the array sizes.
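
Something like the following is what I have in mind - a rough, untested
sketch, assuming C11 alignas and DPDK's RTE_CACHE_LINE_SIZE, with the
arrays kept at their current size of 4:

    /* sketch only: keep 4 entries and request cache-line alignment
     * explicitly rather than implying it via a larger array;
     * alignas needs <stdalign.h> under C11 */
    alignas(RTE_CACHE_LINE_SIZE) struct rte_ether_hdr *eth_hdr[4];
    alignas(RTE_CACHE_LINE_SIZE) struct rte_mbuf *mb[4];

That would also make it easier to tell whether any gain comes from the
alignment itself rather than from the register hint.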

>  	uint64_t ol_flags;
>  	int i;
>  	int r;
> -	__m128i addr0, addr1, addr2, addr3;
> +	register __m128i addr0, addr1, addr2, addr3;
>  	/**
>  	 * shuffle mask be used to shuffle the 16 bytes.
>  	 * byte 0-5 wills be swapped with byte 6-11.
>  	 * byte 12-15 will keep unchanged.
>  	 */
> -	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> -					5, 4, 3, 2,
> -					1, 0, 11, 10,
> -					9, 8, 7, 6);
> +	register const __m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> +							5, 4, 3, 2,
> +							1, 0, 11, 10,
> +							9, 8, 7, 6);
>  
>  	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
>  	vlan_qinq_set(pkts, nb, ol_flags,
> @@ -44,23 +44,24 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
>  
>  		mb[0] = pkts[i++];
>  		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);
> -		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
> -
>  		mb[1] = pkts[i++];
>  		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct rte_ether_hdr *);
> -		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
> -
> -
>  		mb[2] = pkts[i++];
>  		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct rte_ether_hdr *);
> -		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
> -
>  		mb[3] = pkts[i++];
>  		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct rte_ether_hdr *);
> -		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
>  
> +		/* Interleave load, shuffle & set */
> +		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
> +		mbuf_field_set(mb[0], ol_flags);
> +		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
> +		mbuf_field_set(mb[1], ol_flags);
>  		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
> +		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
> +		mbuf_field_set(mb[2], ol_flags);
>  		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
> +		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
> +		mbuf_field_set(mb[3], ol_flags);
>  		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
>  		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
>  
> @@ -69,25 +70,22 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
>  		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
>  		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
>  
> -		mbuf_field_set(mb[0], ol_flags);
> -		mbuf_field_set(mb[1], ol_flags);
> -		mbuf_field_set(mb[2], ol_flags);
> -		mbuf_field_set(mb[3], ol_flags);
>  		r -= 4;
>  	}
>  
>  	for ( ; i < nb; i++) {
>  		if (i < nb - 1)
>  			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
> +
>  		mb[0] = pkts[i];
>  		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);
>  
>  		/* Swap dest and src mac addresses. */
>  		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
> +		/* MBUF and Ethernet are 2 separate cacheline */
> +		mbuf_field_set(mb[0], ol_flags);
>  		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
>  		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
> -
> -		mbuf_field_set(mb[0], ol_flags);
>  	}

Since the final loop only handles the odd leftover elements at the end of
the array of buffers, does changing the instruction ordering here really
make a perf difference?



* Re: [PATCH] app/testpmd: improve sse based macswap
  2024-07-25 12:47     ` Varghese, Vipin
@ 2024-07-25 12:52       ` Bruce Richardson
  0 siblings, 0 replies; 7+ messages in thread
From: Bruce Richardson @ 2024-07-25 12:52 UTC (permalink / raw)
  To: Varghese, Vipin
  Cc: Ferruh Yigit, dev, Mcnamara, John, Xu, HailinX, konstantin.v.ananyev

On Thu, Jul 25, 2024 at 06:17:35PM +0530, Varghese, Vipin wrote:
>    Hi Bruce,
>    Thanks for highlighting the variance. We found this was an internal
>    test bed configuration issue. We are sharing the next version of the
>    same patch with updated numbers.
> 

Great, thanks for the update.



* Re: [PATCH] app/testpmd: improve sse based macswap
  2024-07-23 17:12   ` Bruce Richardson
@ 2024-07-25 12:47     ` Varghese, Vipin
  2024-07-25 12:52       ` Bruce Richardson
  0 siblings, 1 reply; 7+ messages in thread
From: Varghese, Vipin @ 2024-07-25 12:47 UTC (permalink / raw)
  To: Bruce Richardson, Ferruh Yigit
  Cc: dev, Mcnamara, John, Xu, HailinX, konstantin.v.ananyev


Hi Bruce,

Thanks for highlighting the variance. We found this was an internal test 
bed configuration issue. We are sharing the next version of the same 
patch with updated numbers.


On 7/23/2024 10:42 PM, Bruce Richardson wrote:
>
> On Tue, Jul 23, 2024 at 05:45:57PM +0100, Ferruh Yigit wrote:
>> On 7/16/2024 7:37 AM, Vipin Varghese wrote:
>>> The goal of this patch is to improve the SSE macswap on x86_64 by
>>> reducing stalls in the CPU backend. The original SSE macswap
>>> implementation performs the loads, shuffles and stores as separate
>>> groups within the loop. By interleaving the SIMD instructions we can
>>> reduce the stalls caused by
>>>   - SSE load token exhaustion
>>>   - the dependency of each shuffle on its preceding load
>>>
>>> Other changes which improve packets per second are
>>>   - interleaving the MBUF offload-flag writes (a separate cacheline),
>>>   - using the register keyword
>>>
>>> Build test using meson script:
>>> ``````````````````````````````
>>>
>>> build-gcc-static
>>> buildtools
>>> build-gcc-shared
>>> build-mini
>>> build-clang-static
>>> build-clang-shared
>>> build-x86-generic
>>>
>>> Test Results:
>>> `````````````
>>>
>>> Platform-1: AMD EPYC SIENA 8594P @2.3GHz, no boost
>>>
>>> ------------------------------------------------
>>> TEST IO 64B: baseline <NIC : MPPs>
>>>   - mellanox CX-7 2*200Gbps : 42.0
>>>   - intel E810 1*100Gbps : 82.0
>>>   - intel E810 2*200Gbps (2CQ-DA2): 82.45
>>> ------------------------------------------------
>>> TEST MACSWAP 64B: <NIC : Before : After>
>>>   - mellanox CX-7 2*200Gbps : 31.533 : 31.90
>>>   - intel E810 1*100Gbps : 50.380 : 47.0
>>>   - intel E810 2*200Gbps (2CQ-DA2): 48.840 : 49.827
>>> ------------------------------------------------
>>> TEST MACSWAP 128B: <NIC : Before: After>
>>>   - mellanox CX-7 2*200Gbps: 30.946 : 31.770
>>>   - intel E810 1*100Gbps: 49.386 : 46.366
>>>   - intel E810 2*200Gbps (2CQ-DA2): 47.979 : 49.503
>>> ------------------------------------------------
>>> TEST MACSWAP 256B: <NIC: Before: After>
>>>   - mellanox CX-7 2*200Gbps: 32.480 : 33.150
>>>   - intel E810 1 * 100Gbps: 45.29 : 44.571
>>>   - intel E810 2 * 200Gbps (2CQ-DA2): 45.033 : 45.117
>>> ------------------------------------------------
>>>
>>> Platform-2: AMD EPYC 9554 @3.1GHz, no boost
>>>
>>> ------------------------------------------------
>>> TEST IO 64B: baseline <NIC : MPPs>
>>>   - intel E810 2*200Gbps (2CQ-DA2): 82.49
>>> ------------------------------------------------
>>> <NIC intel E810 2*200Gbps (2CQ-DA2): Before : After>
>>> TEST MACSWAP: 1Q 1C1T
>>>   64B: : 45.0 : 45.54
>>> 128B: : 44.48 : 44.43
>>> 256B: : 42.0 : 41.99
>>> +++++++++++++++++++++++++
>>> TEST MACSWAP: 2Q 2C2T
>>>   64B: : 59.5 : 60.55
>>> 128B: : 56.78 : 58.1
>>> 256B: : 41.85 : 41.99
>>> ------------------------------------------------
>>>
>>> Signed-off-by: Vipin Varghese<vipin.varghese@amd.com>
>>>
>> Hi Bruce, John,
>>
>> Can you please help testing macswap performance with this patch on Intel
>> platforms, to be sure it is not causing regression?
>>
> Hi Ferruh,
>
> We can try and get some Intel numbers for you, but I think at this point it
> is better deferred to 24.11 due to lack of discussion and analysis of the
> numbers. This is because the numbers above already show that it is causing
> regressions - in fact many of the regressions are larger than the benefits
> shown. This may be acceptable, but it would imply that we shouldn't be too
> hasty in applying the patch.
>
> Regards,
> /Bruce



* Re: [PATCH] app/testpmd: improve sse based macswap
  2024-07-23 16:45 ` Ferruh Yigit
@ 2024-07-23 17:12   ` Bruce Richardson
  2024-07-25 12:47     ` Varghese, Vipin
  0 siblings, 1 reply; 7+ messages in thread
From: Bruce Richardson @ 2024-07-23 17:12 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Vipin Varghese, dev, Mcnamara, John, Xu, HailinX, konstantin.v.ananyev

On Tue, Jul 23, 2024 at 05:45:57PM +0100, Ferruh Yigit wrote:
> On 7/16/2024 7:37 AM, Vipin Varghese wrote:
> > The goal of this patch is to improve the SSE macswap on x86_64 by
> > reducing stalls in the CPU backend. The original SSE macswap
> > implementation performs the loads, shuffles and stores as separate
> > groups within the loop. By interleaving the SIMD instructions we can
> > reduce the stalls caused by
> >  - SSE load token exhaustion
> >  - the dependency of each shuffle on its preceding load
> > 
> > Other changes which improve packets per second are
> >  - interleaving the MBUF offload-flag writes (a separate cacheline),
> >  - using the register keyword
> > 
> > Build test using meson script:
> > ``````````````````````````````
> > 
> > build-gcc-static
> > buildtools
> > build-gcc-shared
> > build-mini
> > build-clang-static
> > build-clang-shared
> > build-x86-generic
> > 
> > Test Results:
> > `````````````
> > 
> > Platform-1: AMD EPYC SIENA 8594P @2.3GHz, no boost
> > 
> > ------------------------------------------------
> > TEST IO 64B: baseline <NIC : MPPs>
> >  - mellanox CX-7 2*200Gbps : 42.0
> >  - intel E810 1*100Gbps : 82.0
> >  - intel E810 2*200Gbps (2CQ-DA2): 82.45
> > ------------------------------------------------
> > TEST MACSWAP 64B: <NIC : Before : After>
> >  - mellanox CX-7 2*200Gbps : 31.533 : 31.90
> >  - intel E810 1*100Gbps : 50.380 : 47.0
> >  - intel E810 2*200Gbps (2CQ-DA2): 48.840 : 49.827
> > ------------------------------------------------
> > TEST MACSWAP 128B: <NIC : Before: After>
> >  - mellanox CX-7 2*200Gbps: 30.946 : 31.770
> >  - intel E810 1*100Gbps: 49.386 : 46.366
> >  - intel E810 2*200Gbps (2CQ-DA2): 47.979 : 49.503
> > ------------------------------------------------
> > TEST MACSWAP 256B: <NIC: Before: After>
> >  - mellanox CX-7 2*200Gbps: 32.480 : 33.150
> >  - intel E810 1 * 100Gbps: 45.29 : 44.571
> >  - intel E810 2 * 200Gbps (2CQ-DA2): 45.033 : 45.117
> > ------------------------------------------------
> > 
> > Platform-2: AMD EPYC 9554 @3.1GHz, no boost
> > 
> > ------------------------------------------------
> > TEST IO 64B: baseline <NIC : MPPs>
> >  - intel E810 2*200Gbps (2CQ-DA2): 82.49
> > ------------------------------------------------
> > <NIC intel E810 2*200Gbps (2CQ-DA2): Before : After>
> > TEST MACSWAP: 1Q 1C1T
> >  64B: : 45.0 : 45.54
> > 128B: : 44.48 : 44.43
> > 256B: : 42.0 : 41.99
> > +++++++++++++++++++++++++
> > TEST MACSWAP: 2Q 2C2T
> >  64B: : 59.5 : 60.55
> > 128B: : 56.78 : 58.1
> > 256B: : 41.85 : 41.99
> > ------------------------------------------------
> > 
> > Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
> >
> 
> Hi Bruce, John,
> 
> Can you please help testing macswap performance with this patch on Intel
> platforms, to be sure it is not causing regression?
> 
Hi Ferruh,

We can try and get some Intel numbers for you, but I think at this point it
is better deferred to 24.11 due to lack of discussion and analysis of the
numbers. This is because the numbers above already show that it is causing
regressions - in fact many of the regressions are larger than the benefits
shown. This may be acceptable, but it would imply that we shouldn't be too
hasty in applying the patch.

Regards,
/Bruce


* Re: [PATCH] app/testpmd: improve sse based macswap
  2024-07-16  6:37 Vipin Varghese
@ 2024-07-23 16:45 ` Ferruh Yigit
  2024-07-23 17:12   ` Bruce Richardson
  0 siblings, 1 reply; 7+ messages in thread
From: Ferruh Yigit @ 2024-07-23 16:45 UTC (permalink / raw)
  To: Vipin Varghese, dev, bruce.richardson, Mcnamara, John, Xu, HailinX
  Cc: konstantin.v.ananyev

On 7/16/2024 7:37 AM, Vipin Varghese wrote:
> The goal of this patch is to improve the SSE macswap on x86_64 by
> reducing stalls in the CPU backend. The original SSE macswap
> implementation performs the loads, shuffles and stores as separate
> groups within the loop. By interleaving the SIMD instructions we can
> reduce the stalls caused by
>  - SSE load token exhaustion
>  - the dependency of each shuffle on its preceding load
> 
> Other changes which improve packets per second are
>  - interleaving the MBUF offload-flag writes (a separate cacheline),
>  - using the register keyword
> 
> Build test using meson script:
> ``````````````````````````````
> 
> build-gcc-static
> buildtools
> build-gcc-shared
> build-mini
> build-clang-static
> build-clang-shared
> build-x86-generic
> 
> Test Results:
> `````````````
> 
> Platform-1: AMD EPYC SIENA 8594P @2.3GHz, no boost
> 
> ------------------------------------------------
> TEST IO 64B: baseline <NIC : MPPs>
>  - mellanox CX-7 2*200Gbps : 42.0
>  - intel E810 1*100Gbps : 82.0
>  - intel E810 2*200Gbps (2CQ-DA2): 82.45
> ------------------------------------------------
> TEST MACSWAP 64B: <NIC : Before : After>
>  - mellanox CX-7 2*200Gbps : 31.533 : 31.90
>  - intel E810 1*100Gbps : 50.380 : 47.0
>  - intel E810 2*200Gbps (2CQ-DA2): 48.840 : 49.827
> ------------------------------------------------
> TEST MACSWAP 128B: <NIC : Before: After>
>  - mellanox CX-7 2*200Gbps: 30.946 : 31.770
>  - intel E810 1*100Gbps: 49.386 : 46.366
>  - intel E810 2*200Gbps (2CQ-DA2): 47.979 : 49.503
> ------------------------------------------------
> TEST MACSWAP 256B: <NIC: Before: After>
>  - mellanox CX-7 2*200Gbps: 32.480 : 33.150
>  - intel E810 1 * 100Gbps: 45.29 : 44.571
>  - intel E810 2 * 200Gbps (2CQ-DA2): 45.033 : 45.117
> ------------------------------------------------
> 
> Platform-2: AMD EPYC 9554 @3.1GHz, no boost
> 
> ------------------------------------------------
> TEST IO 64B: baseline <NIC : MPPs>
>  - intel E810 2*200Gbps (2CQ-DA2): 82.49
> ------------------------------------------------
> <NIC intel E810 2*200Gbps (2CQ-DA2): Before : After>
> TEST MACSWAP: 1Q 1C1T
>  64B: : 45.0 : 45.54
> 128B: : 44.48 : 44.43
> 256B: : 42.0 : 41.99
> +++++++++++++++++++++++++
> TEST MACSWAP: 2Q 2C2T
>  64B: : 59.5 : 60.55
> 128B: : 56.78 : 58.1
> 256B: : 41.85 : 41.99
> ------------------------------------------------
> 
> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>

Hi Bruce, John,

Can you please help testing macswap performance with this patch on Intel
platforms, to be sure it is not causing regression?

Another option is to take this patch for -rc3 and test it there, with the
condition that it gets removed if any regression shows up - would that help
with testing the patch?

Thanks,
ferruh



* [PATCH] app/testpmd: improve sse based macswap
@ 2024-07-16  6:37 Vipin Varghese
  2024-07-23 16:45 ` Ferruh Yigit
  0 siblings, 1 reply; 7+ messages in thread
From: Vipin Varghese @ 2024-07-16  6:37 UTC (permalink / raw)
  To: dev, ferruh.yigit, bruce.richardson, konstantin.v.ananyev,
	vipin.varghese

The goal of this patch is to improve the SSE macswap on x86_64 by
reducing stalls in the CPU backend. The original SSE macswap
implementation performs the loads, shuffles and stores as separate
groups within the loop. By interleaving the SIMD instructions we can
reduce the stalls caused by
 - SSE load token exhaustion
 - the dependency of each shuffle on its preceding load

Other changes which improve packets per second are
 - interleaving the MBUF offload-flag writes (a separate cacheline),
 - using the register keyword

Build test using meson script:
``````````````````````````````

build-gcc-static
buildtools
build-gcc-shared
build-mini
build-clang-static
build-clang-shared
build-x86-generic

Test Results:
`````````````

Platform-1: AMD EPYC SIENA 8594P @2.3GHz, no boost

------------------------------------------------
TEST IO 64B: baseline <NIC : MPPs>
 - mellanox CX-7 2*200Gbps : 42.0
 - intel E810 1*100Gbps : 82.0
 - intel E810 2*200Gbps (2CQ-DA2): 82.45
------------------------------------------------
TEST MACSWAP 64B: <NIC : Before : After>
 - mellanox CX-7 2*200Gbps : 31.533 : 31.90
 - intel E810 1*100Gbps : 50.380 : 47.0
 - intel E810 2*200Gbps (2CQ-DA2): 48.840 : 49.827
------------------------------------------------
TEST MACSWAP 128B: <NIC : Before: After>
 - mellanox CX-7 2*200Gbps: 30.946 : 31.770
 - intel E810 1*100Gbps: 49.386 : 46.366
 - intel E810 2*200Gbps (2CQ-DA2): 47.979 : 49.503
------------------------------------------------
TEST MACSWAP 256B: <NIC: Before: After>
 - mellanox CX-7 2*200Gbps: 32.480 : 33.150
 - intel E810 1 * 100Gbps: 45.29 : 44.571
 - intel E810 2 * 200Gbps (2CQ-DA2): 45.033 : 45.117
------------------------------------------------

Platform-2: AMD EPYC 9554 @3.1GHz, no boost

------------------------------------------------
TEST IO 64B: baseline <NIC : MPPs>
 - intel E810 2*200Gbps (2CQ-DA2): 82.49
------------------------------------------------
<NIC intel E810 2*200Gbps (2CQ-DA2): Before : After>
TEST MACSWAP: 1Q 1C1T
 64B: : 45.0 : 45.54
128B: : 44.48 : 44.43
256B: : 42.0 : 41.99
+++++++++++++++++++++++++
TEST MACSWAP: 2Q 2C2T
 64B: : 59.5 : 60.55
128B: : 56.78 : 58.1
256B: : 41.85 : 41.99
------------------------------------------------

Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
---
 app/test-pmd/macswap_sse.h | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index 223f87a539..6e4ed21924 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -16,16 +16,16 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 	uint64_t ol_flags;
 	int i;
 	int r;
-	__m128i addr0, addr1, addr2, addr3;
+	register __m128i addr0, addr1, addr2, addr3;
 	/**
 	 * shuffle mask be used to shuffle the 16 bytes.
 	 * byte 0-5 wills be swapped with byte 6-11.
 	 * byte 12-15 will keep unchanged.
 	 */
-	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
-					5, 4, 3, 2,
-					1, 0, 11, 10,
-					9, 8, 7, 6);
+	register const __m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+							5, 4, 3, 2,
+							1, 0, 11, 10,
+							9, 8, 7, 6);
 
 	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
 	vlan_qinq_set(pkts, nb, ol_flags,
@@ -44,23 +44,24 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 
 		mb[0] = pkts[i++];
 		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);
-		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
-
 		mb[1] = pkts[i++];
 		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct rte_ether_hdr *);
-		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
-
-
 		mb[2] = pkts[i++];
 		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct rte_ether_hdr *);
-		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
-
 		mb[3] = pkts[i++];
 		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct rte_ether_hdr *);
-		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
 
+		/* Interleave loads and shuffle with field set */
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+		mbuf_field_set(mb[0], ol_flags);
+		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+		mbuf_field_set(mb[1], ol_flags);
 		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+		mbuf_field_set(mb[2], ol_flags);
 		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
+		mbuf_field_set(mb[3], ol_flags);
 		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
 		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
 
@@ -69,25 +70,21 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
 		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
 
-		mbuf_field_set(mb[0], ol_flags);
-		mbuf_field_set(mb[1], ol_flags);
-		mbuf_field_set(mb[2], ol_flags);
-		mbuf_field_set(mb[3], ol_flags);
 		r -= 4;
 	}
 
 	for ( ; i < nb; i++) {
-		if (i < nb - 1)
+		if (i < (nb - 1))
 			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
 		mb[0] = pkts[i];
 		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct rte_ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
 		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+		/* invoke field_set as it is on separate cacheline */
+		mbuf_field_set(mb[0], ol_flags);
 		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
 		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
-
-		mbuf_field_set(mb[0], ol_flags);
 	}
 }
 
-- 
2.34.1



Thread overview: 7+ messages
2024-07-13 15:19 [PATCH] app/testpmd: improve sse based macswap Vipin Varghese
2024-07-15 15:07 ` Bruce Richardson
2024-07-16  6:37 Vipin Varghese
2024-07-23 16:45 ` Ferruh Yigit
2024-07-23 17:12   ` Bruce Richardson
2024-07-25 12:47     ` Varghese, Vipin
2024-07-25 12:52       ` Bruce Richardson
