DPDK patches and discussions
* [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
@ 2025-10-11 11:29 Shreesh Adiga
  0 siblings, 0 replies; 9+ messages in thread
From: Shreesh Adiga @ 2025-10-11 11:29 UTC (permalink / raw)
  To: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh; +Cc: dev

Replace the clearing of the lower 32 bits of the XMM register with a
blend against a zero register.
Remove the clearing of the upper 64 bits of tmp1, as it is redundant:
tmp1, after its upper bits were cleared, was XORed with tmp2 before
bits 95:64 of tmp2 were returned, and that XOR leaves bits 95:64
unchanged because those bits of tmp1 are already cleared to 0. Once
the XOR is removed, clearing the upper 64 bits of tmp1 no longer has
any effect and can be removed as well.
Clang already optimizes the AND with a memory operand into the above
sequence, but GCC still emits the AND with memory operands, so it is
eliminated explicitly here.

Additionally, replace the 48-byte crc_xmm_shift_tab with the 32-byte
contents of shf_table, which provides the same functionality.

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
Changes since v1:
Reversed the operands in the blend operation for readability.
Removed the tmp1 operations that do not affect the result, which also
makes clearing the upper 64 bits of tmp1 unnecessary.

 lib/net/net_crc_sse.c | 30 ++++++------------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index 112dc94ac1..e590aeb5ac 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -96,35 +96,24 @@ crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
 static __rte_always_inline uint32_t
 crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
 {
-	static const alignas(16) uint32_t mask1[4] = {
-		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
-	};
-
-	static const alignas(16) uint32_t mask2[4] = {
-		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
-	};
 	__m128i tmp0, tmp1, tmp2;

-	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
+	tmp0 = _mm_blend_epi16(data64, _mm_setzero_si128(), 0x3);

 	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
 	tmp1 = _mm_xor_si128(tmp1, tmp0);
-	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

 	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
-	tmp2 = _mm_xor_si128(tmp2, tmp1);
 	tmp2 = _mm_xor_si128(tmp2, tmp0);

 	return _mm_extract_epi32(tmp2, 2);
 }

-static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+static const alignas(16) uint8_t crc_xmm_shift_tab[32] = {
+	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
 };

 /**
@@ -216,19 +205,12 @@ crc32_eth_calc_pclmulqdq(
 			0x80808080, 0x80808080, 0x80808080, 0x80808080
 		};

-		const alignas(16) uint8_t shf_table[32] = {
-			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-		};
-
 		__m128i last16, a, b;

 		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

 		temp = _mm_loadu_si128((const __m128i *)
-			&shf_table[data_len & 15]);
+			&crc_xmm_shift_tab[data_len & 15]);
 		a = _mm_shuffle_epi8(fold, temp);

 		temp = _mm_xor_si128(temp,
--
2.49.1
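
For readers skimming the diff, this is how crcr32_reduce_64_to_32() reads
after the patch, with comments added to spell out why the two removed
operations were redundant (the comments are explanatory only and are not
part of the patch):

static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* Zero the low 32 bits (16-bit lanes 0 and 1, selected by the
	 * immediate 0x3) instead of AND-ing with the old mask2 constant. */
	tmp0 = _mm_blend_epi16(data64, _mm_setzero_si128(), 0x3);

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	/* The old AND with mask1 cleared bits 127:64 of tmp1 here. The
	 * CLMUL below (immediate 0x10) reads only tmp1[63:0], so once the
	 * XOR with tmp1 is dropped, that masking has no observable effect. */

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	/* The old "tmp2 ^= tmp1" only changed bits the final extract never
	 * reads: bits 95:64 of the masked tmp1 were zero, and only bits
	 * 95:64 of tmp2 are returned. */
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}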



* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-10-09 16:41 ` Bruce Richardson
@ 2025-10-10 14:31   ` Shreesh Adiga
  0 siblings, 0 replies; 9+ messages in thread
From: Shreesh Adiga @ 2025-10-10 14:31 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: Konstantin Ananyev, Jasvinder Singh, dev


On Thu, Oct 9, 2025 at 10:11 PM Bruce Richardson <bruce.richardson@intel.com>
wrote:

> On Wed, Jul 16, 2025 at 04:04:39PM +0530, Shreesh Adiga wrote:
> > Replace the clearing of lower 32 bits of XMM register with blend of
> > zero register.
> > Replace the clearing of upper 64 bits of XMM register with
> _mm_move_epi64.
> > Clang is able to optimize away the AND + memory operand with the
> > above sequence, however GCC is still emitting the code for AND with
> > memory operands which is being explicitly eliminated here.
> >
> > Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> > shf_table which is 32 bytes, achieving the same functionality.
> >
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > ---
> >  lib/net/net_crc_sse.c | 30 +++++++-----------------------
> >  1 file changed, 7 insertions(+), 23 deletions(-)
> >
>
> See inline below. The changes to reduce_64_to_32 look ok; I don't know
> enough to fully understand the other changes you made. Maybe split the
> patch into two patches so they can be reviewed and merged separately?
>
> /Bruce
>
> > diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
> > index 112dc94ac1..eec854e587 100644
> > --- a/lib/net/net_crc_sse.c
> > +++ b/lib/net/net_crc_sse.c
> > @@ -96,20 +96,13 @@ crcr32_reduce_128_to_64(__m128i data128, __m128i
> precomp)
> >  static __rte_always_inline uint32_t
> >  crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
> >  {
> > -     static const alignas(16) uint32_t mask1[4] = {
> > -             0xffffffff, 0xffffffff, 0x00000000, 0x00000000
> > -     };
> > -
> > -     static const alignas(16) uint32_t mask2[4] = {
> > -             0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
> > -     };
> >       __m128i tmp0, tmp1, tmp2;
> >
> > -     tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i
> *)mask2));
> > +     tmp0 = _mm_blend_epi16(_mm_setzero_si128(), data64, 252);
>
> Minor nit: 252 would be better in hex, to make it clearer that it is the
> lower two bits that are unset. Even better, how about switching the
> operands so that the constant is just "3", which is clearer again.
>
Okay I will update it with your suggestion in the next patch.
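
For reference, the two spellings select the same lanes; a set bit in the
blend immediate picks the 16-bit lane from the second operand, so swapping
the operands simply inverts the mask:

	/* Both zero lanes 0 and 1, i.e. the low 32 bits of data64. */
	tmp0 = _mm_blend_epi16(_mm_setzero_si128(), data64, 0xFC); /* v1 (252) */
	tmp0 = _mm_blend_epi16(data64, _mm_setzero_si128(), 0x3);  /* v2 */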


> >
> >       tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
> >       tmp1 = _mm_xor_si128(tmp1, tmp0);
> > -     tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
> > +     tmp1 = _mm_move_epi64(tmp1);
> >
>
> This change LGTM.
>
> >       tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
> >       tmp2 = _mm_xor_si128(tmp2, tmp1);
> > @@ -118,13 +111,11 @@ crcr32_reduce_64_to_32(__m128i data64, __m128i
> precomp)
> >       return _mm_extract_epi32(tmp2, 2);
> >  }
> >
> > -static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
> > -     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > -     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +static const alignas(16) uint8_t crc_xmm_shift_tab[32] = {
> > +     0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
> > +     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
> >       0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> > -     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
> > -     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > -     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
> > +     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
> >  };
> >
>
> Can you perhaps explain how changing this table doesn't break existing uses
> of the table as it now is in the code? Specifically, does xmm_shift_left
> function not now have different behaviour?
>
Sure. crc_xmm_shift_tab is only used inside xmm_shift_left, which is only
called when the total data_len is < 16. We call
xmm_shift_left(fold, 8 - data_len) when len <= 4 and
xmm_shift_left(fold, 16 - data_len) when 5 <= len <= 15. This results in
accessing crc_xmm_shift_tab at offsets between 1 and 31, i.e. element 0 is
never accessed.

Now take the specific case of xmm_shift_left(fold, 10). Previously the
shuffle register was loaded from (crc_xmm_shift_tab + 16 - 10), i.e.
crc_xmm_shift_tab + 6:
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01,
0x02, 0x03, 0x04, 0x05}
which, used with PSHUFB, results in the first 10 bytes of the register
being 0 and the lower 6 elements moving left:
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, d1, d2, d3, d4, d5, d6}. The zeroes are
inserted because PSHUFB produces 0 for any index > 0x7f (MSB set).

Now that the contents of crc_xmm_shift_tab have been replaced with those
of shf_table, we instead load:
{0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01,
0x02, 0x03, 0x04, 0x05}
into the index register. Since the first 10 elements have the MSB set,
PSHUFB again produces the same vector as above, with the first 10 elements
being zeroes and the 6 data bytes moving left as intended.
Since xmm_shift_left is only called with num between 1 and 11, we never
access crc_xmm_shift_tab[0], and the remaining 0x8x elements behave
identically to 0xff under PSHUFB. Thus xmm_shift_left behaves identically
for all num values currently used in this file.
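
A scalar model of PSHUFB makes this easy to check; the sketch below is
only illustrative and is not code from the patch:

#include <stdint.h>

/* Scalar model of PSHUFB (_mm_shuffle_epi8): an index byte with the MSB
 * set yields 0, otherwise its low 4 bits select a source byte. */
static void pshufb_model(uint8_t dst[16], const uint8_t src[16],
			 const uint8_t idx[16])
{
	for (int i = 0; i < 16; i++)
		dst[i] = (idx[i] & 0x80) ? 0 : src[idx[i] & 0x0f];
}

Feeding it the 16 index bytes starting at offset (16 - num), for any num
between 1 and 11, gives the same output for either table: the padding
bytes differ (0xff versus 0x81..0x8f) but all of them have the MSB set,
and the 0x00..0x0f indices are identical in both tables.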


>
> >  /**
> > @@ -216,19 +207,12 @@ crc32_eth_calc_pclmulqdq(
> >                       0x80808080, 0x80808080, 0x80808080, 0x80808080
> >               };
> >
> > -             const alignas(16) uint8_t shf_table[32] = {
> > -                     0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
> > -                     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
> > -                     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> > -                     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
> > -             };
> > -
> >               __m128i last16, a, b;
> >
> >               last16 = _mm_loadu_si128((const __m128i *)&data[data_len -
> 16]);
> >
> >               temp = _mm_loadu_si128((const __m128i *)
> > -                     &shf_table[data_len & 15]);
> > +                     &crc_xmm_shift_tab[data_len & 15]);
> >               a = _mm_shuffle_epi8(fold, temp);
> >
> >               temp = _mm_xor_si128(temp,
> > --
> > 2.49.1
> >
>



* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-07-16 10:34 Shreesh Adiga
  2025-09-24 14:58 ` Thomas Monjalon
@ 2025-10-09 16:41 ` Bruce Richardson
  2025-10-10 14:31   ` Shreesh Adiga
  1 sibling, 1 reply; 9+ messages in thread
From: Bruce Richardson @ 2025-10-09 16:41 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Konstantin Ananyev, Jasvinder Singh, dev

On Wed, Jul 16, 2025 at 04:04:39PM +0530, Shreesh Adiga wrote:
> Replace the clearing of lower 32 bits of XMM register with blend of
> zero register.
> Replace the clearing of upper 64 bits of XMM register with _mm_move_epi64.
> Clang is able to optimize away the AND + memory operand with the
> above sequence, however GCC is still emitting the code for AND with
> memory operands which is being explicitly eliminated here.
> 
> Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> shf_table which is 32 bytes, achieving the same functionality.
> 
> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> ---
>  lib/net/net_crc_sse.c | 30 +++++++-----------------------
>  1 file changed, 7 insertions(+), 23 deletions(-)
> 

See inline below. The changes to reduce_64_to_32 look ok; I don't know
enough to fully understand the other changes you made. Maybe split the
patch into two patches so they can be reviewed and merged separately?

/Bruce

> diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
> index 112dc94ac1..eec854e587 100644
> --- a/lib/net/net_crc_sse.c
> +++ b/lib/net/net_crc_sse.c
> @@ -96,20 +96,13 @@ crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
>  static __rte_always_inline uint32_t
>  crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
>  {
> -	static const alignas(16) uint32_t mask1[4] = {
> -		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
> -	};
> -
> -	static const alignas(16) uint32_t mask2[4] = {
> -		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
> -	};
>  	__m128i tmp0, tmp1, tmp2;
>  
> -	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
> +	tmp0 = _mm_blend_epi16(_mm_setzero_si128(), data64, 252);

Minor nit: 252 would be better in hex, to make it clearer that it is the
lower two bits that are unset. Even better, how about switching the
operands so that the constant is just "3", which is clearer again.

>  
>  	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
>  	tmp1 = _mm_xor_si128(tmp1, tmp0);
> -	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
> +	tmp1 = _mm_move_epi64(tmp1);
>  

This change LGTM.

>  	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
>  	tmp2 = _mm_xor_si128(tmp2, tmp1);
> @@ -118,13 +111,11 @@ crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
>  	return _mm_extract_epi32(tmp2, 2);
>  }
>  
> -static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
> -	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> -	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> +static const alignas(16) uint8_t crc_xmm_shift_tab[32] = {
> +	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
> +	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
>  	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> -	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
> -	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> -	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
> +	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
>  };
>  

Can you perhaps explain how changing this table doesn't break existing uses
of the table as it now is in the code? Specifically, does xmm_shift_left
function not now have different behaviour?

>  /**
> @@ -216,19 +207,12 @@ crc32_eth_calc_pclmulqdq(
>  			0x80808080, 0x80808080, 0x80808080, 0x80808080
>  		};
>  
> -		const alignas(16) uint8_t shf_table[32] = {
> -			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
> -			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
> -			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> -			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
> -		};
> -
>  		__m128i last16, a, b;
>  
>  		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
>  
>  		temp = _mm_loadu_si128((const __m128i *)
> -			&shf_table[data_len & 15]);
> +			&crc_xmm_shift_tab[data_len & 15]);
>  		a = _mm_shuffle_epi8(fold, temp);
>  
>  		temp = _mm_xor_si128(temp,
> -- 
> 2.49.1
> 


* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-10-01 10:24       ` Shreesh Adiga
@ 2025-10-01 12:16         ` Thomas Monjalon
  0 siblings, 0 replies; 9+ messages in thread
From: Thomas Monjalon @ 2025-10-01 12:16 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev

01/10/2025 12:24, Shreesh Adiga:
> On Wed, Oct 1, 2025 at 1:25 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> 
> > 29/09/2025 18:28, Shreesh Adiga:
> > > On Wed, Sep 24, 2025 at 8:28 PM Thomas Monjalon <thomas@monjalon.net>
> > wrote:
> > >
> > > > Hello,
> > > >
> > > > 16/07/2025 12:34, Shreesh Adiga:
> > > > > Replace the clearing of lower 32 bits of XMM register with blend of
> > > > > zero register.
> > > > > Replace the clearing of upper 64 bits of XMM register with
> > > > _mm_move_epi64.
> > > > > Clang is able to optimize away the AND + memory operand with the
> > > > > above sequence, however GCC is still emitting the code for AND with
> > > > > memory operands which is being explicitly eliminated here.
> > > > >
> > > > > Additionally replace the 48 byte crc_xmm_shift_tab with the contents
> > of
> > > > > shf_table which is 32 bytes, achieving the same functionality.
> > > > >
> > > > > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > > >
> > > > Sorry I'm not following.
> > > > Please could you start with defining the goal of this patch?
> > > > Is it a code simplification or a performance optimization?
> > >
> > > It is intended to be a minor performance optimization.
> >
> > Please could you give some performance numbers in the commit log?
> >
> I don't think this change can be reliably measured. The changes only
> affect the last-stage CRC 64-to-32 fold and the computation over the last
> 16 bytes, so the impact will be a couple of clock cycles at best. I also
> don't know whether reducing the static array usage can be reliably
> measured, especially since it does not affect the main loop.
> This patch can be ignored if minor incremental changes are not desirable.

Minor changes are desirable.
I'm just asking to understand the real impact of the change.
In general when doing an optimization we try to test it
and give some numbers.
Note we have a unit test for CRC in app/test/test_crc.c
but no performance test for it.

I'll wait for a review from an x86 maintainer.
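
In the absence of a perf test, a rough micro-benchmark along the following
lines could be added under app/test. This is only a sketch: it assumes the
rte_net_crc_set_alg()/rte_net_crc_calc() form of the API from
rte_net_crc.h, so the calls may need adapting to whatever API the tree it
is built against provides.

#include <stdio.h>
#include <stdint.h>
#include <rte_cycles.h>
#include <rte_net_crc.h>

/* Rough cycles-per-call measurement; short lengths (< 16 bytes) exercise
 * the tail path touched by this patch. */
static void
crc_perf_sample(const uint8_t *buf, uint32_t len, uint32_t iters)
{
	volatile uint32_t crc = 0;
	uint64_t start, cycles;
	uint32_t i;

	/* Force the PCLMULQDQ implementation. */
	rte_net_crc_set_alg(RTE_NET_CRC_SSE42_x86_64_PCLMULQDQ);

	start = rte_rdtsc_precise();
	for (i = 0; i < iters; i++)
		crc = rte_net_crc_calc(buf, len, RTE_NET_CRC32_ETH);
	cycles = rte_rdtsc_precise() - start;

	printf("len=%u: %.2f cycles/call (crc=0x%08x)\n",
	       len, (double)cycles / iters, crc);
}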




* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-10-01  7:55     ` Thomas Monjalon
@ 2025-10-01 10:24       ` Shreesh Adiga
  2025-10-01 12:16         ` Thomas Monjalon
  0 siblings, 1 reply; 9+ messages in thread
From: Shreesh Adiga @ 2025-10-01 10:24 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev


On Wed, Oct 1, 2025 at 1:25 PM Thomas Monjalon <thomas@monjalon.net> wrote:

> 29/09/2025 18:28, Shreesh Adiga:
> > On Wed, Sep 24, 2025 at 8:28 PM Thomas Monjalon <thomas@monjalon.net>
> wrote:
> >
> > > Hello,
> > >
> > > 16/07/2025 12:34, Shreesh Adiga:
> > > > Replace the clearing of lower 32 bits of XMM register with blend of
> > > > zero register.
> > > > Replace the clearing of upper 64 bits of XMM register with
> > > _mm_move_epi64.
> > > > Clang is able to optimize away the AND + memory operand with the
> > > > above sequence, however GCC is still emitting the code for AND with
> > > > memory operands which is being explicitly eliminated here.
> > > >
> > > > Additionally replace the 48 byte crc_xmm_shift_tab with the contents
> of
> > > > shf_table which is 32 bytes, achieving the same functionality.
> > > >
> > > > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > >
> > > Sorry I'm not following.
> > > Please could you start with defining the goal of this patch?
> > > Is it a code simplification or a performance optimization?
> >
> > It is intended to be a minor performance optimization.
>
> Please could you give some performance numbers in the commit log?
>
I don't think this change can be reliably measured. The changes only
affect the last-stage CRC 64-to-32 fold and the computation over the last
16 bytes, so the impact will be a couple of clock cycles at best. I also
don't know whether reducing the static array usage can be reliably
measured, especially since it does not affect the main loop.
This patch can be ignored if minor incremental changes are not desirable.



* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-09-29 16:28   ` Shreesh Adiga
@ 2025-10-01  7:55     ` Thomas Monjalon
  2025-10-01 10:24       ` Shreesh Adiga
  0 siblings, 1 reply; 9+ messages in thread
From: Thomas Monjalon @ 2025-10-01  7:55 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev

29/09/2025 18:28, Shreesh Adiga:
> On Wed, Sep 24, 2025 at 8:28 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> 
> > Hello,
> >
> > 16/07/2025 12:34, Shreesh Adiga:
> > > Replace the clearing of lower 32 bits of XMM register with blend of
> > > zero register.
> > > Replace the clearing of upper 64 bits of XMM register with
> > _mm_move_epi64.
> > > Clang is able to optimize away the AND + memory operand with the
> > > above sequence, however GCC is still emitting the code for AND with
> > > memory operands which is being explicitly eliminated here.
> > >
> > > Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> > > shf_table which is 32 bytes, achieving the same functionality.
> > >
> > > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> >
> > Sorry I'm not following.
> > Please could you start with defining the goal of this patch?
> > Is it a code simplification or a performance optimization?
> 
> It is intended to be a minor performance optimization.

Please could you give some performance numbers in the commit log?





* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-09-24 14:58 ` Thomas Monjalon
@ 2025-09-29 16:28   ` Shreesh Adiga
  2025-10-01  7:55     ` Thomas Monjalon
  0 siblings, 1 reply; 9+ messages in thread
From: Shreesh Adiga @ 2025-09-29 16:28 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev


On Wed, Sep 24, 2025 at 8:28 PM Thomas Monjalon <thomas@monjalon.net> wrote:

> Hello,
>
> 16/07/2025 12:34, Shreesh Adiga:
> > Replace the clearing of lower 32 bits of XMM register with blend of
> > zero register.
> > Replace the clearing of upper 64 bits of XMM register with
> _mm_move_epi64.
> > Clang is able to optimize away the AND + memory operand with the
> > above sequence, however GCC is still emitting the code for AND with
> > memory operands which is being explicitly eliminated here.
> >
> > Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> > shf_table which is 32 bytes, achieving the same functionality.
> >
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
>
> Sorry I'm not following.
> Please could you start with defining the goal of this patch?
> Is it a code simplification or a performance optimization?

It is intended to be a minor performance optimization.



* Re: [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
  2025-07-16 10:34 Shreesh Adiga
@ 2025-09-24 14:58 ` Thomas Monjalon
  2025-09-29 16:28   ` Shreesh Adiga
  2025-10-09 16:41 ` Bruce Richardson
  1 sibling, 1 reply; 9+ messages in thread
From: Thomas Monjalon @ 2025-09-24 14:58 UTC (permalink / raw)
  To: Shreesh Adiga; +Cc: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh, dev

Hello,

16/07/2025 12:34, Shreesh Adiga:
> Replace the clearing of lower 32 bits of XMM register with blend of
> zero register.
> Replace the clearing of upper 64 bits of XMM register with _mm_move_epi64.
> Clang is able to optimize away the AND + memory operand with the
> above sequence, however GCC is still emitting the code for AND with
> memory operands which is being explicitly eliminated here.
> 
> Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
> shf_table which is 32 bytes, achieving the same functionality.
> 
> Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>

Sorry I'm not following.
Please could you start with defining the goal of this patch?
Is it a code simplification or a performance optimization?




* [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
@ 2025-07-16 10:34 Shreesh Adiga
  2025-09-24 14:58 ` Thomas Monjalon
  2025-10-09 16:41 ` Bruce Richardson
  0 siblings, 2 replies; 9+ messages in thread
From: Shreesh Adiga @ 2025-07-16 10:34 UTC (permalink / raw)
  To: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh; +Cc: dev

Replace the clearing of lower 32 bits of XMM register with blend of
zero register.
Replace the clearing of upper 64 bits of XMM register with _mm_move_epi64.
Clang is able to optimize away the AND + memory operand with the
above sequence, however GCC is still emitting the code for AND with
memory operands which is being explicitly eliminated here.

Additionally replace the 48 byte crc_xmm_shift_tab with the contents of
shf_table which is 32 bytes, achieving the same functionality.

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
 lib/net/net_crc_sse.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index 112dc94ac1..eec854e587 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -96,20 +96,13 @@ crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
 static __rte_always_inline uint32_t
 crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
 {
-	static const alignas(16) uint32_t mask1[4] = {
-		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
-	};
-
-	static const alignas(16) uint32_t mask2[4] = {
-		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
-	};
 	__m128i tmp0, tmp1, tmp2;
 
-	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
+	tmp0 = _mm_blend_epi16(_mm_setzero_si128(), data64, 252);
 
 	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
 	tmp1 = _mm_xor_si128(tmp1, tmp0);
-	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
+	tmp1 = _mm_move_epi64(tmp1);
 
 	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
 	tmp2 = _mm_xor_si128(tmp2, tmp1);
@@ -118,13 +111,11 @@ crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
 	return _mm_extract_epi32(tmp2, 2);
 }
 
-static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+static const alignas(16) uint8_t crc_xmm_shift_tab[32] = {
+	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
 };
 
 /**
@@ -216,19 +207,12 @@ crc32_eth_calc_pclmulqdq(
 			0x80808080, 0x80808080, 0x80808080, 0x80808080
 		};
 
-		const alignas(16) uint8_t shf_table[32] = {
-			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-		};
-
 		__m128i last16, a, b;
 
 		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
 
 		temp = _mm_loadu_si128((const __m128i *)
-			&shf_table[data_len & 15]);
+			&crc_xmm_shift_tab[data_len & 15]);
 		a = _mm_shuffle_epi8(fold, temp);
 
 		temp = _mm_xor_si128(temp,
-- 
2.49.1



end of thread, other threads:[~2025-10-11 11:32 UTC | newest]

Thread overview: 9+ messages
-- links below jump to the message on this page --
2025-10-11 11:29 [PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c Shreesh Adiga
  -- strict thread matches above, loose matches on Subject: below --
2025-07-16 10:34 Shreesh Adiga
2025-09-24 14:58 ` Thomas Monjalon
2025-09-29 16:28   ` Shreesh Adiga
2025-10-01  7:55     ` Thomas Monjalon
2025-10-01 10:24       ` Shreesh Adiga
2025-10-01 12:16         ` Thomas Monjalon
2025-10-09 16:41 ` Bruce Richardson
2025-10-10 14:31   ` Shreesh Adiga
