[dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC

DPDK patches and discussions
 help / color / mirror / Atom feed

* [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC
@ 2020-09-10 12:01 Mairtin o Loingsigh
  2020-09-10 12:27 ` Bruce Richardson
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Mairtin o Loingsigh @ 2020-09-10 12:01 UTC (permalink / raw)
  To: jasvinder.singh
  Cc: dev, brendan.ryan, David.Coyle, pablo.de.lara.guarch,
	Mairtin o Loingsigh

This patch enables the generation of CRC using AVX512 instruction
set when available on the host platform.

Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
---

v1:
* Initial version, with AVX512 support for CRC32 Ethernet only
 (requires further updates)
  * AVX512 support for CRC16-CCITT and final implementation of
    CRC32 Ethernet will be added in v2
---
 doc/guides/rel_notes/release_20_11.rst |    4 +
 lib/librte_net/net_crc_avx.h           |  331 ++++++++++++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c           |   23 ++-
 lib/librte_net/rte_net_crc.h           |    1 +
 4 files changed, 358 insertions(+), 1 deletions(-)
 create mode 100644 lib/librte_net/net_crc_avx.h

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index df227a1..d6a84ca 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -55,6 +55,10 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added support for AVX512 in rte_net CRC calculations.**
+
+  Added new CRC32 calculation code using the AVX512 instruction set.
+  Added new CRC16-CCITT calculation code using the AVX512 instruction set.
 
 Removed Items
 -------------
diff --git a/lib/librte_net/net_crc_avx.h b/lib/librte_net/net_crc_avx.h
new file mode 100644
index 0000000..d9481d5
--- /dev/null
+++ b/lib/librte_net/net_crc_avx.h
@@ -0,0 +1,331 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_NET_CRC_AVX_H_
+#define _RTE_NET_CRC_AVX_H_
+
+#include <rte_branch_prediction.h>
+
+#include <rte_vect.h>
+#include <immintrin.h>
+#include <x86intrin.h>
+#include <cpuid.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PCLMULQDQ CRC computation context structure */
+struct crc_pclmulqdq512_ctx {
+	__m512i rk1_rk2;
+	__m512i rk3_rk4;
+	__m512i rk5_rk6;
+	__m512i rk7_rk8;
+};
+
+static struct crc_pclmulqdq512_ctx crc32_eth_pclmulqdq __rte_aligned(16);
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_64BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block
+ *   64 byte data block
+ * @param precomp
+ *   Precomputed rk1 constant
+ * @param fold
+ *   Current 16 byte folded data
+ *
+ * @return
+ *   New 16 byte folded data
+ */
+static __rte_always_inline __m512i
+crcr32_folding_round(__m512i data_block,
+		__m512i precomp,
+		__m512i fold)
+{
+	__m512i tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
+	__m512i tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);
+
+	return _mm512_xor_si512(tmp1, _mm512_xor_si512(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128
+ *   128 bits data to be reduced
+ * @param precomp
+ *   precomputed constants rk5, rk6
+ *
+ * @return
+ *  64 bits reduced data
+ */
+
+static __rte_always_inline __m128i
+crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
+{
+	__m128i tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
+	tmp1 = _mm_srli_si128(data128, 8);
+	tmp0 = _mm_xor_si128(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = _mm_slli_si128(tmp0, 4);
+	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
+
+	return _mm_xor_si128(tmp1, tmp0);
+}
+
+/**
+ * Performs Barrett's reduction from 64 bits to 32 bits
+ *
+ * @param data64
+ *   64 bits data to be reduced
+ * @param precomp
+ *   rk7 precomputed constant
+ *
+ * @return
+ *   reduced 32 bits data
+ */
+
+static __rte_always_inline uint32_t
+crcr32_reduce_64_to_32(__m512i data64, __m512i precomp)
+{
+	static const uint32_t mask1[4] __rte_aligned(64) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+
+	static const uint32_t mask2[4] __rte_aligned(64) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	__m512i tmp0, tmp1, tmp2;
+
+	tmp0 = _mm512_and_si512(data64, _mm512_load_si512(
+		(const __m512i *)mask2));
+
+	tmp1 = _mm512_clmulepi64_epi128(tmp0, precomp, 0x00);
+	tmp1 = _mm512_xor_si512(tmp1, tmp0);
+	tmp1 = _mm512_and_si512(tmp1, _mm512_load_si512(
+		(const __m128i *)mask1));
+
+	tmp2 = _mm512_clmulepi64_epi128(tmp1, precomp, 0x10);
+	tmp2 = _mm512_xor_si512(tmp2, tmp1);
+	tmp2 = _mm512_xor_si512(tmp2, tmp0);
+
+	return 0;
+}
+
+static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(64) = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg
+ *   128 bit value
+ * @param num
+ *   number of bytes to shift left reg by (0-16)
+ *
+ * @return
+ *   reg << (num * 8)
+ */
+
+static __rte_always_inline __m512i
+xmm_shift_left(__m512i reg, const unsigned int num)
+{
+	const __m512i *p = (const __m512i *)(crc_xmm_shift_tab + 16 - num);
+
+	/* TODO: Check unaligned load*/
+	return _mm512_shuffle_epi8(reg, _mm512_load_si512(p));
+}
+
+static __rte_always_inline uint32_t
+crc32_eth_calc_pclmulqdq(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pclmulqdq512_ctx *params)
+{
+	__m256i b;
+	__m512i temp, k;
+	__m512i qw0 = _mm512_set1_epi64(0);
+	__m512i fold0;
+	uint32_t n;
+
+	/* Get CRC init value */
+	b = _mm256_insert_epi32(_mm256_setzero_si256(), crc, 0);
+	temp = _mm512_inserti32x8(_mm512_setzero_si512(), b, 0);
+
+	/* align data to 16B*/
+	if (unlikely(data_len < 64)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			/* TODO: Unaligned load not working */
+			fold0 = _mm512_load_epi64((const __m512i *)data);
+			fold0 = _mm512_xor_si512(fold0, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold0 = _mm512_load_si512((const __m128i *)buffer);
+			fold0 = _mm512_xor_si512(fold0, temp);
+			if (unlikely(data_len < 4)) {
+				fold0 = xmm_shift_left(fold0, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold0 = xmm_shift_left(fold0, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold0 = _mm512_loadu_si512((const __m512i *)data);
+		fold0 = _mm512_xor_si512(fold0, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/*Loop of folds*/
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold0 = _mm512_loadu_si512((const __m512i *)data);
+	fold0 = _mm512_xor_si512(fold0, temp);
+
+	/** Main folding loop - the last 32 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 64; (n + 64) <= data_len; n += 64) {
+		qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
+		fold0 = crcr32_folding_round(qw0, k, fold0);
+	}
+
+	/* 256 to 128 fold */
+	/* Check this */
+	k = params->rk3_rk4;
+	fold0 = crcr32_folding_round(temp, k, fold0);
+	n += 64;
+
+	/* Remainder */
+partial_bytes:
+	if (likely(n < data_len)) {
+
+		const uint32_t mask3[4] __rte_aligned(16) = {
+			0x80808080, 0x80808080, 0x80808080, 0x80808080
+		};
+
+		const uint8_t shf_table[32] __rte_aligned(16) = {
+			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+		};
+
+		__m128i last16;
+		__m512i a, b;
+
+		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
+
+		RTE_SET_USED(last16);
+
+		temp = _mm512_loadu_si512((const __m512i *)
+			&shf_table[data_len & 15]);
+		a = _mm512_shuffle_epi8(fold0, temp);
+
+		temp = _mm512_xor_si512(temp,
+			_mm512_load_si512((const __m512i *)mask3));
+		b = _mm512_shuffle_epi8(fold0, temp);
+
+		/* k = rk1 & rk2 */
+		temp = _mm512_clmulepi64_epi128(a, k, 0x01);
+		fold0 = _mm512_clmulepi64_epi128(a, k, 0x10);
+
+		fold0 = _mm512_xor_si512(fold0, temp);
+		fold0 = _mm512_xor_si512(fold0, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold0, k);
+
+	return n;
+}
+
+
+static inline void
+rte_net_crc_avx512_init(void)
+{
+	__m128i a;
+	uint64_t k1, k2, k3, k4, k5, k6;
+	uint64_t p = 0, q = 0;
+
+	/** Initialize CRC32 data */
+	/* 256 fold constants*/
+	k1 = 0xe95c1271LLU;
+	k2 = 0xce3371cbLLU;
+
+	/*256 - 128 fold constants */
+	k3 = 0x910eeec1LLU;
+	k4 = 0x33fff533LLU;
+
+	k5 = 0xccaa009eLLU;
+	k6 = 0x163cd6124LLU;
+	q =  0x1f7011640LLU;
+	p =  0x1db710641LLU;
+
+	/** Save the params in context structure */
+	a = _mm_set_epi64x(k2, k1);
+	crc32_eth_pclmulqdq.rk1_rk2 = _mm512_broadcast_i32x4(a);
+	crc32_eth_pclmulqdq.rk3_rk4 = _mm512_setr_epi64(
+		k3, k4, 0, 0, 0, 0, 0, 0);
+	crc32_eth_pclmulqdq.rk5_rk6 = _mm512_setr_epi64(
+		k5, k6, 0, 0, 0, 0, 0, 0);
+	crc32_eth_pclmulqdq.rk7_rk8 = _mm512_setr_epi64(
+		q, p, 0, 0, 0, 0, 0, 0);
+	/**
+	 * Reset the register as following calculation may
+	 * use other data types such as float, double, etc.
+	 */
+	_mm_empty();
+
+}
+
+static inline uint32_t
+rte_crc32_eth_avx512_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pclmulqdq(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pclmulqdq);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_NET_CRC_AVX_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 9fd4794..b2b2bc1 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -10,12 +10,18 @@
 #include <rte_common.h>
 #include <rte_net_crc.h>
 
-#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
+#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \
+	&& defined(RTE_MACHINE_CPUFLAG_AVX512F)
+#define X86_64_AVX512F_PCLMULQDQ     1
+#elif defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
 #elif defined(RTE_ARCH_ARM64) && defined(RTE_MACHINE_CPUFLAG_PMULL)
 #define ARM64_NEON_PMULL           1
 #endif
 
+#ifdef X86_64_AVX512F_PCLMULQDQ
+#include <net_crc_avx.h>
+#endif
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
 #elif defined ARM64_NEON_PMULL
@@ -48,6 +54,12 @@
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_handler,
 };
 
+#ifdef X86_64_AVX512F_PCLMULQDQ
+static rte_net_crc_handler handlers_avx512[] = {
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_avx512_handler,
+};
+#endif
+
 #ifdef X86_64_SSE42_PCLMULQDQ
 static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
@@ -157,6 +169,11 @@
 			handlers = handlers_neon;
 			break;
 		}
+#elif defined X86_64_AVX512F_PCLMULQDQ
+		/* fall-through */
+	case RTE_NET_CRC_AVX512:
+			handlers = handlers_avx512;
+			break;
 #endif
 		/* fall-through */
 	case RTE_NET_CRC_SCALAR:
@@ -197,6 +214,10 @@
 		rte_net_crc_neon_init();
 	}
 #endif
+#ifdef X86_64_AVX512F_PCLMULQDQ
+	alg = RTE_NET_CRC_AVX512;
+	rte_net_crc_avx512_init();
+#endif
 
 	rte_net_crc_set_alg(alg);
 }
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 16e85ca..a7d2ed0 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -23,6 +23,7 @@ enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
 	RTE_NET_CRC_NEON,
+	RTE_NET_CRC_AVX512,
 };
 
 /**
-- 
1.7.0.7


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC
  2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh
@ 2020-09-10 12:27 ` Bruce Richardson
  2020-09-10 12:52   ` O'loingsigh, Mairtin
  2020-09-29 15:45   ` O'loingsigh, Mairtin
  2020-09-11  9:57 ` De Lara Guarch, Pablo
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 10+ messages in thread
From: Bruce Richardson @ 2020-09-10 12:27 UTC (permalink / raw)
  To: Mairtin o Loingsigh
  Cc: jasvinder.singh, dev, brendan.ryan, David.Coyle, pablo.de.lara.guarch

On Thu, Sep 10, 2020 at 01:01:11PM +0100, Mairtin o Loingsigh wrote:
> This patch enables the generation of CRC using AVX512 instruction
> set when available on the host platform.
> 
> Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
> ---
> 
> v1:
> * Initial version, with AVX512 support for CRC32 Ethernet only
>  (requires further updates)
>   * AVX512 support for CRC16-CCITT and final implementation of
>     CRC32 Ethernet will be added in v2
> ---
>  doc/guides/rel_notes/release_20_11.rst |    4 +
>  lib/librte_net/net_crc_avx.h           |  331 ++++++++++++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c           |   23 ++-
>  lib/librte_net/rte_net_crc.h           |    1 +
>  4 files changed, 358 insertions(+), 1 deletions(-)
>  create mode 100644 lib/librte_net/net_crc_avx.h
> 
<snip>
> --- a/lib/librte_net/rte_net_crc.c
> +++ b/lib/librte_net/rte_net_crc.c
> @@ -10,12 +10,18 @@
>  #include <rte_common.h>
>  #include <rte_net_crc.h>
>  
> -#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
> +#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \
> +	&& defined(RTE_MACHINE_CPUFLAG_AVX512F)
> +#define X86_64_AVX512F_PCLMULQDQ     1
> +#elif defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)

This all seems to be build-time selection of path. Can you perhaps
investigate adding runtime selection instead, so that this can be used from
distro packages, or DPDK compiled on older systems but used on newer.
See also patchset: http://patches.dpdk.org/project/dpdk/list/?series=11831
which is relevant to this too.

/Bruce

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC
  2020-09-10 12:27 ` Bruce Richardson
@ 2020-09-10 12:52   ` O'loingsigh, Mairtin
  2020-09-29 15:45   ` O'loingsigh, Mairtin
  1 sibling, 0 replies; 10+ messages in thread
From: O'loingsigh, Mairtin @ 2020-09-10 12:52 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: Singh, Jasvinder, dev, Ryan, Brendan, Coyle, David,
	De Lara Guarch, Pablo



> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Thursday, September 10, 2020 1:28 PM
> To: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>
> Cc: Singh, Jasvinder <jasvinder.singh@intel.com>; dev@dpdk.org; Ryan,
> Brendan <brendan.ryan@intel.com>; Coyle, David <david.coyle@intel.com>;
> De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] net: add support for AVX512 when
> generating CRC
> 
> On Thu, Sep 10, 2020 at 01:01:11PM +0100, Mairtin o Loingsigh wrote:
> > This patch enables the generation of CRC using AVX512 instruction set
> > when available on the host platform.
> >
> > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
> > ---
> >
> > v1:
> > * Initial version, with AVX512 support for CRC32 Ethernet only
> > (requires further updates)
> >   * AVX512 support for CRC16-CCITT and final implementation of
> >     CRC32 Ethernet will be added in v2
> > ---
> >  doc/guides/rel_notes/release_20_11.rst |    4 +
> >  lib/librte_net/net_crc_avx.h           |  331
> ++++++++++++++++++++++++++++++++
> >  lib/librte_net/rte_net_crc.c           |   23 ++-
> >  lib/librte_net/rte_net_crc.h           |    1 +
> >  4 files changed, 358 insertions(+), 1 deletions(-)  create mode
> > 100644 lib/librte_net/net_crc_avx.h
> >
> <snip>
> > --- a/lib/librte_net/rte_net_crc.c
> > +++ b/lib/librte_net/rte_net_crc.c
> > @@ -10,12 +10,18 @@
> >  #include <rte_common.h>
> >  #include <rte_net_crc.h>
> >
> > -#if defined(RTE_ARCH_X86_64) &&
> > defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
> > +#if defined(RTE_ARCH_X86_64) &&
> defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \
> > +	&& defined(RTE_MACHINE_CPUFLAG_AVX512F)
> > +#define X86_64_AVX512F_PCLMULQDQ     1
> > +#elif defined(RTE_ARCH_X86_64) &&
> > +defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
> 
> This all seems to be build-time selection of path. Can you perhaps investigate
> adding runtime selection instead, so that this can be used from distro
> packages, or DPDK compiled on older systems but used on newer.
> See also patchset: http://patches.dpdk.org/project/dpdk/list/?series=11831
> which is relevant to this too.
> 
> /Bruce

Sure. I will look at options for run-time selection of the intrinsic path.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC
  2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh
  2020-09-10 12:27 ` Bruce Richardson
@ 2020-09-11  9:57 ` De Lara Guarch, Pablo
  2020-09-29 15:47   ` O'loingsigh, Mairtin
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 10+ messages in thread
From: De Lara Guarch, Pablo @ 2020-09-11  9:57 UTC (permalink / raw)
  To: O'loingsigh, Mairtin, Singh, Jasvinder
  Cc: dev, Ryan, Brendan, Coyle, David

Hi Mairtin,

> -----Original Message-----
> From: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>
> Sent: Thursday, September 10, 2020 1:01 PM
> To: Singh, Jasvinder <jasvinder.singh@intel.com>
> Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, David
> <david.coyle@intel.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>; O'loingsigh, Mairtin
> <mairtin.oloingsigh@intel.com>
> Subject: [PATCH] net: add support for AVX512 when generating CRC
> 
> This patch enables the generation of CRC using AVX512 instruction set when
> available on the host platform.
> 
> Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
> ---
> 
> v1:
> * Initial version, with AVX512 support for CRC32 Ethernet only  (requires further
> updates)
>   * AVX512 support for CRC16-CCITT and final implementation of
>     CRC32 Ethernet will be added in v2
> ---
>  doc/guides/rel_notes/release_20_11.rst |    4 +
>  lib/librte_net/net_crc_avx.h           |  331 ++++++++++++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c           |   23 ++-
>  lib/librte_net/rte_net_crc.h           |    1 +
>  4 files changed, 358 insertions(+), 1 deletions(-)  create mode 100644
> lib/librte_net/net_crc_avx.h
> 
> diff --git a/doc/guides/rel_notes/release_20_11.rst
> b/doc/guides/rel_notes/release_20_11.rst
> index df227a1..d6a84ca 100644
> --- a/doc/guides/rel_notes/release_20_11.rst
> +++ b/doc/guides/rel_notes/release_20_11.rst
> @@ -55,6 +55,10 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
> 
> +* **Added support for AVX512 in rte_net CRC calculations.**
> +
> +  Added new CRC32 calculation code using AVX512 instruction set  Added
> + new CRC16-CCITT calculation code using AVX512 instruction set
> 
>  Removed Items
>  -------------
> diff --git a/lib/librte_net/net_crc_avx.h b/lib/librte_net/net_crc_avx.h new file
> mode 100644 index 0000000..d9481d5
> --- /dev/null
> +++ b/lib/librte_net/net_crc_avx.h

...

> +static __rte_always_inline uint32_t
> +crc32_eth_calc_pclmulqdq(
> +	const uint8_t *data,
> +	uint32_t data_len,
> +	uint32_t crc,
> +	const struct crc_pclmulqdq512_ctx *params) {
> +	__m256i b;
> +	__m512i temp, k;
> +	__m512i qw0 = _mm512_set1_epi64(0);
> +	__m512i fold0;
> +	uint32_t n;

This is loading 64 bytes of data, but it seems like only 16 are available, right? Should we use _mm_loadu_si128?

> +			fold0 = _mm512_xor_si512(fold0, temp);
> +			goto reduction_128_64;
> +		}
> +
> +		if (unlikely(data_len < 16)) {
> +			/* 0 to 15 bytes */
> +			uint8_t buffer[16] __rte_aligned(16);
> +
> +			memset(buffer, 0, sizeof(buffer));
> +			memcpy(buffer, data, data_len);

I would use _mm_maskz_loadu_epi8, passing a mask register with ((1 << data_len) - 1).

> +
> +			fold0 = _mm512_load_si512((const __m128i *)buffer);
> +			fold0 = _mm512_xor_si512(fold0, temp);
> +			if (unlikely(data_len < 4)) {
> +				fold0 = xmm_shift_left(fold0, 8 - data_len);
> +				goto barret_reduction;
> +			}
> +			fold0 = xmm_shift_left(fold0, 16 - data_len);
> +			goto reduction_128_64;
> +		}
> +		/* 17 to 31 bytes */
> +		fold0 = _mm512_loadu_si512((const __m512i *)data);

Same here. Looks like you are loading too much data?

> +		fold0 = _mm512_xor_si512(fold0, temp);
> +		n = 16;
> +		k = params->rk1_rk2;
> +		goto partial_bytes;
> +	}

...

> +
> +		fold0 = _mm512_xor_si512(fold0, temp);
> +		fold0 = _mm512_xor_si512(fold0, b);

You could use _mm512_ternarylogic_epi64 with 0x96 to do both XORs in one instruction.

> +	}
> +
> +	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
> +reduction_128_64:
> +	k = params->rk5_rk6;
> +
> +barret_reduction:
> +	k = params->rk7_rk8;
> +	n = crcr32_reduce_64_to_32(fold0, k);
> +
> +	return n;
> +}
> +
> +


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC
  2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh
  2020-09-10 12:27 ` Bruce Richardson
  2020-09-11  9:57 ` De Lara Guarch, Pablo
@ 2020-09-29 15:12 ` Mairtin o Loingsigh
  2020-09-29 15:41   ` O'loingsigh, Mairtin
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection Mairtin o Loingsigh
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
  4 siblings, 1 reply; 10+ messages in thread
From: Mairtin o Loingsigh @ 2020-09-29 15:12 UTC (permalink / raw)
  To: jasvinder.singh, bruce.richardson, pablo.de.lara.guarch
  Cc: dev, brendan.ryan, david.coyle, Mairtin o Loingsigh

This patchset makes two significant enhancements to the CRC modules of
the rte_net library:

1) Adds run-time selection of the optimal architecture-specific CRC path.
   Previously the selection was solely made at compile-time, meaning it
   could only be built and run on the same generation of CPU. Adding
   run-time selection ability means this can be used from distro packages
   and/or DPDK can be compiled on an older CPU and run on a newer CPU.
2) Adds an optimized CRC implementation based on the AVX512 and
   VPCLMULQDQ instruction sets.
   
For further details, please see the commit messages of the individual
patches.

v2:
* Added support for run-time selection of optimal architecture-specific
  CRC, based on v1 review comment.
* Added full working AVX512/VPCLMULQDQ support for CRC32-Ethernet and
  CRC16-CCITT.

v1:
* Initial version, with incomplete AVX512/VPCLMULQDQ support for
  CRC32-Ethernet only.

Mairtin o Loingsigh (2):
  net: add run-time architecture specific CRC selection
  net: add support for AVX512/VPCLMULQDQ based CRC

 app/test/test_crc.c                               |  11 +-
 config/x86/meson.build                            |   6 +-
 doc/guides/rel_notes/release_20_11.rst            |   6 +
 lib/librte_net/meson.build                        |  89 ++++-
 lib/librte_net/net_crc.h                          |  45 +++
 lib/librte_net/net_crc_avx512.c                   | 424 ++++++++++++++++++++++
 lib/librte_net/{net_crc_neon.h => net_crc_neon.c} |  27 +-
 lib/librte_net/{net_crc_sse.h => net_crc_sse.c}   |  34 +-
 lib/librte_net/rte_net_crc.c                      | 100 +++--
 lib/librte_net/rte_net_crc.h                      |   4 +-
 10 files changed, 674 insertions(+), 72 deletions(-)
 create mode 100644 lib/librte_net/net_crc.h
 create mode 100644 lib/librte_net/net_crc_avx512.c
 rename lib/librte_net/{net_crc_neon.h => net_crc_neon.c} (95%)
 rename lib/librte_net/{net_crc_sse.h => net_crc_sse.c} (94%)

-- 
2.12.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection
  2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh
                   ` (2 preceding siblings ...)
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
@ 2020-09-29 15:12 ` Mairtin o Loingsigh
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
  4 siblings, 0 replies; 10+ messages in thread
From: Mairtin o Loingsigh @ 2020-09-29 15:12 UTC (permalink / raw)
  To: jasvinder.singh, bruce.richardson, pablo.de.lara.guarch
  Cc: dev, brendan.ryan, david.coyle, Mairtin o Loingsigh

This patch adds support for run-time selection of the optimal
architecture-specific CRC path, based on the supported instruction set(s)
of the CPU.

The compiler option checks have been moved from the C files to the meson
script. The rte_cpu_get_flag_enabled function is called automatically by
the library at process initialization time to determine which
instructions the CPU supports, with the most optimal supported CRC path
ultimately selected.

Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
Signed-off-by: David Coyle <david.coyle@intel.com>
---
 doc/guides/rel_notes/release_20_11.rst            |  4 ++
 lib/librte_net/meson.build                        | 34 +++++++++++-
 lib/librte_net/net_crc.h                          | 34 ++++++++++++
 lib/librte_net/{net_crc_neon.h => net_crc_neon.c} | 27 +++------
 lib/librte_net/{net_crc_sse.h => net_crc_sse.c}   | 34 ++++--------
 lib/librte_net/rte_net_crc.c                      | 67 ++++++++++++++---------
 6 files changed, 132 insertions(+), 68 deletions(-)
 create mode 100644 lib/librte_net/net_crc.h
 rename lib/librte_net/{net_crc_neon.h => net_crc_neon.c} (95%)
 rename lib/librte_net/{net_crc_sse.h => net_crc_sse.c} (94%)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index 4eb3224a7..6bd222dca 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -55,6 +55,10 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Updated CRC modules of rte_net library.**
+
+  * Added run-time selection of the optimal architecture-specific CRC path.
+
 * **Updated Cisco enic driver.**
 
   * Added support for VF representors with single-queue Tx/Rx and flow API
diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build
index 24ed8253b..b6880bd85 100644
--- a/lib/librte_net/meson.build
+++ b/lib/librte_net/meson.build
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017 Intel Corporation
+# Copyright(c) 2017-2020 Intel Corporation
 
 headers = files('rte_ip.h',
 	'rte_tcp.h',
@@ -20,3 +20,35 @@ headers = files('rte_ip.h',
 
 sources = files('rte_arp.c', 'rte_ether.c', 'rte_net.c', 'rte_net_crc.c')
 deps += ['mbuf']
+
+if dpdk_conf.has('RTE_ARCH_X86_64')
+	net_crc_sse42_cpu_support = \
+		cc.get_define('__PCLMUL__', args: machine_args) != ''
+	net_crc_sse42_cc_support = \
+		cc.has_argument('-mpclmul') and cc.has_argument('-maes')
+
+	build_static_net_crc_sse42_lib = 0
+
+	if net_crc_sse42_cpu_support == true
+		sources += files('net_crc_sse.c')
+		cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+	elif net_crc_sse42_cc_support == true
+		build_static_net_crc_sse42_lib = 1
+		net_crc_sse42_lib_cflags = ['-mpclmul', '-maes']
+		cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+	endif
+
+	if build_static_net_crc_sse42_lib == 1
+		net_crc_sse42_lib = static_library(
+					'net_crc_sse42_lib',
+					'net_crc_sse.c',
+					dependencies: static_rte_eal,
+					c_args: [cflags,
+						net_crc_sse42_lib_cflags])
+		objs += net_crc_sse42_lib.extract_objects('net_crc_sse.c')
+	endif
+elif dpdk_conf.has('RTE_ARCH_ARM64') and \
+		cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != ''
+	sources += files('net_crc_neon.c')
+	cflags += ['-DCC_ARM64_NEON_PMULL_SUPPORT']
+endif
diff --git a/lib/librte_net/net_crc.h b/lib/librte_net/net_crc.h
new file mode 100644
index 000000000..a1578a56c
--- /dev/null
+++ b/lib/librte_net/net_crc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _NET_CRC_H_
+#define _NET_CRC_H_
+
+/*
+ * Different implementations of CRC
+ */
+
+/* SSE4.2 */
+
+void
+rte_net_crc_sse42_init(void);
+
+uint32_t
+rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len);
+
+/* NEON */
+
+void
+rte_net_crc_neon_init(void);
+
+uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len);
+
+#endif /* _NET_CRC_H_ */
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.c
similarity index 95%
rename from lib/librte_net/net_crc_neon.h
rename to lib/librte_net/net_crc_neon.c
index 63fa1d4a1..b79684ec2 100644
--- a/lib/librte_net/net_crc_neon.h
+++ b/lib/librte_net/net_crc_neon.c
@@ -1,18 +1,17 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2020 Intel Corporation
  */
 
-#ifndef _NET_CRC_NEON_H_
-#define _NET_CRC_NEON_H_
+#include <string.h>
 
+#include <rte_common.h>
 #include <rte_branch_prediction.h>
 #include <rte_net_crc.h>
 #include <rte_vect.h>
 #include <rte_cpuflags.h>
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include "net_crc.h"
 
 /** PMULL CRC computation context structure */
 struct crc_pmull_ctx {
@@ -218,7 +217,7 @@ crc32_eth_calc_pmull(
 	return n;
 }
 
-static inline void
+void
 rte_net_crc_neon_init(void)
 {
 	/* Initialize CRC16 data */
@@ -242,9 +241,8 @@ rte_net_crc_neon_init(void)
 	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
 }
 
-static inline uint32_t
-rte_crc16_ccitt_neon_handler(const uint8_t *data,
-	uint32_t data_len)
+uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
 {
 	return (uint16_t)~crc32_eth_calc_pmull(data,
 		data_len,
@@ -252,18 +250,11 @@ rte_crc16_ccitt_neon_handler(const uint8_t *data,
 		&crc16_ccitt_pmull);
 }
 
-static inline uint32_t
-rte_crc32_eth_neon_handler(const uint8_t *data,
-	uint32_t data_len)
+uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
 {
 	return ~crc32_eth_calc_pmull(data,
 		data_len,
 		0xffffffffUL,
 		&crc32_eth_pmull);
 }
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/net_crc_sse.h b/lib/librte_net/net_crc_sse.c
similarity index 94%
rename from lib/librte_net/net_crc_sse.h
rename to lib/librte_net/net_crc_sse.c
index 1c7b7a548..053b54b39 100644
--- a/lib/librte_net/net_crc_sse.h
+++ b/lib/librte_net/net_crc_sse.c
@@ -1,18 +1,16 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
  */
 
-#ifndef _RTE_NET_CRC_SSE_H_
-#define _RTE_NET_CRC_SSE_H_
+#include <string.h>
 
+#include <rte_common.h>
 #include <rte_branch_prediction.h>
+#include <rte_cpuflags.h>
 
-#include <x86intrin.h>
-#include <cpuid.h>
+#include "net_crc.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include <x86intrin.h>
 
 /** PCLMULQDQ CRC computation context structure */
 struct crc_pclmulqdq_ctx {
@@ -259,8 +257,7 @@ crc32_eth_calc_pclmulqdq(
 	return n;
 }
 
-
-static inline void
+void
 rte_net_crc_sse42_init(void)
 {
 	uint64_t k1, k2, k5, k6;
@@ -303,12 +300,10 @@ rte_net_crc_sse42_init(void)
 	 * use other data types such as float, double, etc.
 	 */
 	_mm_empty();
-
 }
 
-static inline uint32_t
-rte_crc16_ccitt_sse42_handler(const uint8_t *data,
-	uint32_t data_len)
+uint32_t
+rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
 {
 	/** return 16-bit CRC value */
 	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
@@ -317,18 +312,11 @@ rte_crc16_ccitt_sse42_handler(const uint8_t *data,
 		&crc16_ccitt_pclmulqdq);
 }
 
-static inline uint32_t
-rte_crc32_eth_sse42_handler(const uint8_t *data,
-	uint32_t data_len)
+uint32_t
+rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
 {
 	return ~crc32_eth_calc_pclmulqdq(data,
 		data_len,
 		0xffffffffUL,
 		&crc32_eth_pclmulqdq);
 }
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _RTE_NET_CRC_SSE_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 4f5b9e828..83dccbfba 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
  */
 
 #include <stddef.h>
@@ -10,17 +10,7 @@
 #include <rte_common.h>
 #include <rte_net_crc.h>
 
-#if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
-#define X86_64_SSE42_PCLMULQDQ     1
-#elif defined(RTE_ARCH_ARM64) && defined(__ARM_FEATURE_CRYPTO)
-#define ARM64_NEON_PMULL           1
-#endif
-
-#ifdef X86_64_SSE42_PCLMULQDQ
-#include <net_crc_sse.h>
-#elif defined ARM64_NEON_PMULL
-#include <net_crc_neon.h>
-#endif
+#include "net_crc.h"
 
 /** CRC polynomials */
 #define CRC32_ETH_POLYNOMIAL 0x04c11db7UL
@@ -47,13 +37,13 @@ static rte_net_crc_handler handlers_scalar[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_handler,
 };
-
-#ifdef X86_64_SSE42_PCLMULQDQ
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
 static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
-#elif defined ARM64_NEON_PMULL
+#endif
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
 static rte_net_crc_handler handlers_neon[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
@@ -142,22 +132,44 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
 		crc32_eth_lut);
 }
 
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
+static uint8_t
+sse42_pclmulqdq_cpu_supported(void)
+{
+	return rte_cpu_get_flag_enabled(RTE_CPUFLAG_PCLMULQDQ);
+}
+#endif
+
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
+static uint8_t
+neon_pmull_cpu_supported(void)
+{
+	return rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL);
+}
+#endif
+
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-#ifdef X86_64_SSE42_PCLMULQDQ
+#ifdef RTE_ARCH_X86_64
 	case RTE_NET_CRC_SSE42:
-		handlers = handlers_sse42;
-		break;
-#elif defined ARM64_NEON_PMULL
-		/* fall-through */
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
+		if (sse42_pclmulqdq_cpu_supported()) {
+			handlers = handlers_sse42;
+			break;
+		}
+#endif
+#endif /* RTE_ARCH_X86_64 */
+#ifdef RTE_ARCH_ARM64
 	case RTE_NET_CRC_NEON:
-		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
+		if (neon_pmull_cpu_supported()) {
 			handlers = handlers_neon;
 			break;
 		}
 #endif
+#endif /* RTE_ARCH_ARM64 */
 		/* fall-through */
 	case RTE_NET_CRC_SCALAR:
 		/* fall-through */
@@ -188,11 +200,14 @@ RTE_INIT(rte_net_crc_init)
 
 	rte_net_crc_scalar_init();
 
-#ifdef X86_64_SSE42_PCLMULQDQ
-	alg = RTE_NET_CRC_SSE42;
-	rte_net_crc_sse42_init();
-#elif defined ARM64_NEON_PMULL
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
+	if (sse42_pclmulqdq_cpu_supported()) {
+		alg = RTE_NET_CRC_SSE42;
+		rte_net_crc_sse42_init();
+	}
+#endif
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
+	if (neon_pmull_cpu_supported()) {
 		alg = RTE_NET_CRC_NEON;
 		rte_net_crc_neon_init();
 	}
-- 
2.12.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC
  2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh
                   ` (3 preceding siblings ...)
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection Mairtin o Loingsigh
@ 2020-09-29 15:12 ` Mairtin o Loingsigh
  4 siblings, 0 replies; 10+ messages in thread
From: Mairtin o Loingsigh @ 2020-09-29 15:12 UTC (permalink / raw)
  To: jasvinder.singh, bruce.richardson, pablo.de.lara.guarch
  Cc: dev, brendan.ryan, david.coyle, Mairtin o Loingsigh

This patch enables the optimized calculation of CRC32-Ethernet and
CRC16-CCITT using the AVX512 and VPCLMULQDQ instruction sets. This CRC
implementation is built if the compiler supports the required instruction
sets. It is selected at run-time if the host CPU, again, supports the
required instruction sets.

Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
Signed-off-by: David Coyle <david.coyle@intel.com>
---
 app/test/test_crc.c                    |  11 +-
 config/x86/meson.build                 |   6 +-
 doc/guides/rel_notes/release_20_11.rst |   2 +
 lib/librte_net/meson.build             |  55 +++++
 lib/librte_net/net_crc.h               |  11 +
 lib/librte_net/net_crc_avx512.c        | 424 +++++++++++++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c           |  33 +++
 lib/librte_net/rte_net_crc.h           |   4 +-
 8 files changed, 542 insertions(+), 4 deletions(-)
 create mode 100644 lib/librte_net/net_crc_avx512.c

diff --git a/app/test/test_crc.c b/app/test/test_crc.c
index f8a74e04e..bf1d34435 100644
--- a/app/test/test_crc.c
+++ b/app/test/test_crc.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
  */
 
 #include "test.h"
@@ -149,6 +149,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC avx512 mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_AVX512);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (x86_64 AVX512): failed (%d)\n", ret);
+		return ret;
+	}
+
 	/* set CRC neon mode */
 	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
 
diff --git a/config/x86/meson.build b/config/x86/meson.build
index fea4d5403..172b72b72 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017-2019 Intel Corporation
+# Copyright(c) 2017-2020 Intel Corporation
 
 # get binutils version for the workaround of Bug 97
 if not is_windows
@@ -23,7 +23,9 @@ endforeach
 
 optional_flags = ['AES', 'PCLMUL',
 		'AVX', 'AVX2', 'AVX512F',
-		'RDRND', 'RDSEED']
+		'RDRND', 'RDSEED',
+		'AVX512BW', 'AVX512DQ',
+		'AVX512VL', 'VPCLMULQDQ']
 foreach f:optional_flags
 	if cc.get_define('__@0@__'.format(f), args: machine_args) == '1'
 		if f == 'PCLMUL' # special case flags with different defines
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index 6bd222dca..509749ebd 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -58,6 +58,8 @@ New Features
 * **Updated CRC modules of rte_net library.**
 
   * Added run-time selection of the optimal architecture-specific CRC path.
+  * Added optimized implementations of CRC32-Ethernet and CRC16-CCITT
+    using the AVX512 and VPCLMULQDQ instruction sets.
 
 * **Updated Cisco enic driver.**
 
diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build
index b6880bd85..eeae25bc1 100644
--- a/lib/librte_net/meson.build
+++ b/lib/librte_net/meson.build
@@ -24,18 +24,62 @@ deps += ['mbuf']
 if dpdk_conf.has('RTE_ARCH_X86_64')
 	net_crc_sse42_cpu_support = \
 		cc.get_define('__PCLMUL__', args: machine_args) != ''
+	net_crc_avx512_cpu_support = \
+		cc.get_define('__AVX512F__', args: machine_args) != '' and \
+		cc.get_define('__AVX512BW__', args: machine_args) != '' and \
+		cc.get_define('__AVX512DQ__', args: machine_args) != '' and \
+		cc.get_define('__AVX512VL__', args: machine_args) != '' and \
+		cc.get_define('__VPCLMULQDQ__', args: machine_args) != ''
+
 	net_crc_sse42_cc_support = \
 		cc.has_argument('-mpclmul') and cc.has_argument('-maes')
+	net_crc_avx512_cc_support = \
+		not machine_args.contains('-mno-avx512f') and \
+		cc.has_argument('-mavx512f') and \
+		cc.has_argument('-mavx512bw') and \
+		cc.has_argument('-mavx512dq') and \
+		cc.has_argument('-mavx512vl') and \
+		cc.has_argument('-mvpclmulqdq') and \
+		cc.has_argument('-mavx2') and \
+		cc.has_argument('-mavx')
 
 	build_static_net_crc_sse42_lib = 0
+	build_static_net_crc_avx512_lib = 0
 
 	if net_crc_sse42_cpu_support == true
 		sources += files('net_crc_sse.c')
 		cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+		if net_crc_avx512_cpu_support == true
+			sources += files('net_crc_avx512.c')
+			cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT']
+		elif net_crc_avx512_cc_support == true
+			build_static_net_crc_avx512_lib = 1
+			net_crc_avx512_lib_cflags = ['-mavx512f',
+							'-mavx512bw',
+							'-mavx512dq',
+							'-mavx512vl',
+							'-mvpclmulqdq',
+							'-mavx2',
+							'-mavx']
+			cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT']
+		endif
 	elif net_crc_sse42_cc_support == true
 		build_static_net_crc_sse42_lib = 1
 		net_crc_sse42_lib_cflags = ['-mpclmul', '-maes']
 		cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+		if net_crc_avx512_cc_support == true
+			build_static_net_crc_avx512_lib = 1
+			net_crc_avx512_lib_cflags = ['-mpclmul',
+							'-maes',
+							'-mavx512f',
+							'-mavx512bw',
+							'-mavx512dq',
+							'-mavx512vl',
+							'-mvpclmulqdq',
+							'-mavx2',
+							'-mavx']
+			cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT']
+		endif
 	endif
 
 	if build_static_net_crc_sse42_lib == 1
@@ -47,6 +91,17 @@ if dpdk_conf.has('RTE_ARCH_X86_64')
 						net_crc_sse42_lib_cflags])
 		objs += net_crc_sse42_lib.extract_objects('net_crc_sse.c')
 	endif
+
+	if build_static_net_crc_avx512_lib == 1
+		net_crc_avx512_lib = static_library(
+					'net_crc_avx512_lib',
+					'net_crc_avx512.c',
+					dependencies: static_rte_eal,
+					c_args: [cflags,
+						net_crc_avx512_lib_cflags])
+		objs += net_crc_avx512_lib.extract_objects('net_crc_avx512.c')
+	endif
+
 elif dpdk_conf.has('RTE_ARCH_ARM64') and \
 		cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != ''
 	sources += files('net_crc_neon.c')
diff --git a/lib/librte_net/net_crc.h b/lib/librte_net/net_crc.h
index a1578a56c..7a74d5406 100644
--- a/lib/librte_net/net_crc.h
+++ b/lib/librte_net/net_crc.h
@@ -20,6 +20,17 @@ rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len);
 uint32_t
 rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len);
 
+/* AVX512 */
+
+void
+rte_net_crc_avx512_init(void);
+
+uint32_t
+rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len);
+
 /* NEON */
 
 void
diff --git a/lib/librte_net/net_crc_avx512.c b/lib/librte_net/net_crc_avx512.c
new file mode 100644
index 000000000..81aac6349
--- /dev/null
+++ b/lib/librte_net/net_crc_avx512.c
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+#include <rte_cpuflags.h>
+
+#include "net_crc.h"
+
+#include <x86intrin.h>
+
+/* VPCLMULQDQ CRC computation context structure */
+struct crc_vpclmulqdq_ctx {
+	__m512i rk1_rk2;
+	__m512i rk3_rk4;
+	__m512i fold_7x128b;
+	__m512i fold_3x128b;
+	__m128i rk5_rk6;
+	__m128i rk7_rk8;
+	__m128i fold_1x128b;
+};
+
+static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
+static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);
+
+static uint16_t byte_len_to_mask_table[] = {
+	0x0000, 0x0001, 0x0003, 0x0007,
+	0x000f, 0x001f, 0x003f, 0x007f,
+	0x00ff, 0x01ff, 0x03ff, 0x07ff,
+	0x0fff, 0x1fff, 0x3fff, 0x7fff,
+	0xffff};
+
+static const uint8_t shf_table[32] __rte_aligned(16) = {
+	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+
+static const uint32_t mask[4] __rte_aligned(16) = {
+	0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+};
+
+static const uint32_t mask2[4] __rte_aligned(16) = {
+	0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+static __rte_always_inline __m512i
+crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
+{
+	__m512i tmp0, tmp1;
+
+	tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
+	tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);
+
+	return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
+}
+
+static __rte_always_inline __m128i
+crc32_fold_128(__m512i fold0, __m512i fold1,
+	const struct crc_vpclmulqdq_ctx *params)
+{
+	__m128i res, res2;
+	__m256i a;
+	__m512i tmp0, tmp1, tmp2, tmp3;
+	__m512i tmp4;
+
+	tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
+	tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);
+
+	res = _mm512_extracti64x2_epi64(fold1, 3);
+	tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);
+
+	tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
+	tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);
+
+	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
+	tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);
+
+	tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);
+
+	a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
+	res = _mm256_extracti64x2_epi64(a, 1);
+	res2 = _mm_xor_si128(res, *(__m128i *)&a);
+
+	return res2;
+}
+
+static __rte_always_inline __m128i
+last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
+	const struct crc_vpclmulqdq_ctx *params)
+{
+	uint32_t offset;
+	__m128i res2, res3, res4, pshufb_shf;
+
+	const uint32_t mask3[4] __rte_aligned(16) = {
+		   0x80808080, 0x80808080, 0x80808080, 0x80808080
+	};
+
+	res2 = res;
+	offset = data_len - n;
+	res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);
+
+	pshufb_shf = _mm_loadu_si128((const __m128i *)
+			(shf_table + (data_len-n)));
+
+	res = _mm_shuffle_epi8(res, pshufb_shf);
+	pshufb_shf = _mm_xor_si128(pshufb_shf,
+			_mm_load_si128((const __m128i *) mask3));
+	res2 = _mm_shuffle_epi8(res2, pshufb_shf);
+
+	res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);
+
+	res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
+	res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
+	res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);
+
+	return res;
+}
+
+static __rte_always_inline __m128i
+done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
+{
+	__m128i res1;
+
+	res1 = res;
+
+	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
+	res1 = _mm_srli_si128(res1, 8);
+	res = _mm_xor_si128(res, res1);
+
+	res1 = res;
+	res = _mm_slli_si128(res, 4);
+	res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
+	res = _mm_xor_si128(res, res1);
+
+	return res;
+}
+
+static __rte_always_inline uint32_t
+barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
+{
+	__m128i tmp0, tmp1;
+
+	data64 =  _mm_and_si128(data64, *(const __m128i *)mask2);
+	tmp0 = data64;
+	tmp1 = data64;
+
+	data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
+	data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
+			0x28);
+
+	tmp1 = data64;
+	data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
+	data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);
+
+	return _mm_extract_epi32(data64, 2);
+}
+
+static __rte_always_inline void
+reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
+	const struct crc_vpclmulqdq_ctx *params)
+{
+	__m128i tmp, tmp1;
+
+	tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
+	*fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
+	*fold = _mm_xor_si128(*fold, tmp);
+	tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
+	*fold = _mm_xor_si128(*fold, tmp1);
+	*n += 16;
+	*len -= 16;
+}
+
+static __rte_always_inline uint32_t
+crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
+	const struct crc_vpclmulqdq_ctx *params)
+{
+	__m128i res, d;
+	__m256i b;
+	__m512i temp, k;
+	__m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
+	__m512i fold0, fold1, fold2, fold3;
+	__mmask16 mask;
+	uint32_t n = 0;
+	int reduction = 0;
+
+	/* Get CRC init value */
+	b = _mm256_insert_epi32(_mm256_setzero_si256(), crc, 0);
+	temp = _mm512_inserti32x8(_mm512_setzero_si512(), b, 0);
+
+	if (data_len > 255) {
+		fold0 = _mm512_loadu_si512((const __m512i *)data);
+		fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
+		fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
+		fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
+		fold0 = _mm512_xor_si512(fold0, temp);
+
+		/* Main folding loop */
+		k = params->rk1_rk2;
+		for (n = 256; (n + 256) <= data_len; n += 256) {
+			qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
+			qw1 = _mm512_loadu_si512((const __m512i *)
+					&(data[n+64]));
+			qw2 = _mm512_loadu_si512((const __m512i *)
+					&(data[n+128]));
+			qw3 = _mm512_loadu_si512((const __m512i *)
+					&(data[n+192]));
+			fold0 = crcr32_folding_round(qw0, k, fold0);
+			fold1 = crcr32_folding_round(qw1, k, fold1);
+			fold2 = crcr32_folding_round(qw2, k, fold2);
+			fold3 = crcr32_folding_round(qw3, k, fold3);
+		}
+
+		/* 256 to 128 fold */
+		k = params->rk3_rk4;
+		fold0 = crcr32_folding_round(fold2, k, fold0);
+		fold1 = crcr32_folding_round(fold3, k, fold1);
+
+		res = crc32_fold_128(fold0, fold1, params);
+
+		reduction = 240 - ((n+256)-data_len);
+
+		while (reduction > 0)
+			reduction_loop(&res, &reduction, data, &n,
+					params);
+
+		reduction += 16;
+
+		if (n != data_len)
+			res = last_two_xmm(data, data_len, n, res,
+					params);
+	} else {
+		if (data_len > 31) {
+			res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
+			d = _mm_loadu_si128((const __m128i *)data);
+			res = _mm_xor_si128(res, d);
+			n += 16;
+
+			reduction = 240 - ((n+256)-data_len);
+
+			while (reduction > 0)
+				reduction_loop(&res, &reduction, data, &n,
+						params);
+
+			if (n != data_len)
+				res = last_two_xmm(data, data_len, n, res,
+						params);
+		} else if (data_len > 16) {
+			res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
+			d = _mm_loadu_si128((const __m128i *)data);
+			res = _mm_xor_si128(res, d);
+			n += 16;
+
+			if (n != data_len)
+				res = last_two_xmm(data, data_len, n, res,
+						params);
+		} else if (data_len == 16) {
+			res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
+			d = _mm_loadu_si128((const __m128i *)data);
+			res = _mm_xor_si128(res, d);
+		} else {
+			res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
+			mask = byte_len_to_mask_table[data_len];
+			d = _mm_maskz_loadu_epi8(mask, data);
+			res = _mm_xor_si128(res, d);
+
+			if (data_len > 3) {
+				d = _mm_loadu_si128((const __m128i *)
+						&shf_table[data_len]);
+				res = _mm_shuffle_epi8(res, d);
+			} else if (data_len > 2) {
+				res = _mm_slli_si128(res, 5);
+				goto do_barrett_reduction;
+			} else if (data_len > 1) {
+				res = _mm_slli_si128(res, 6);
+				goto do_barrett_reduction;
+			} else if (data_len > 0) {
+				res = _mm_slli_si128(res, 7);
+				goto do_barrett_reduction;
+			} else {
+				/* zero length case */
+				return crc;
+			}
+		}
+	}
+
+	res = done_128(res, params);
+
+do_barrett_reduction:
+	n = barrett_reduction(res, params);
+
+	return n;
+}
+
+static void
+crc32_load_init_constants(void)
+{
+	__m128i a;
+	/* fold constants */
+	uint64_t c0 = 0x00000000e95c1271;
+	uint64_t c1 = 0x00000000ce3371cb;
+	uint64_t c2 = 0x00000000910eeec1;
+	uint64_t c3 = 0x0000000033fff533;
+	uint64_t c4 = 0x000000000cbec0ed;
+	uint64_t c5 = 0x0000000031f8303f;
+	uint64_t c6 = 0x0000000057c54819;
+	uint64_t c7 = 0x00000000df068dc2;
+	uint64_t c8 = 0x00000000ae0b5394;
+	uint64_t c9 = 0x000000001c279815;
+	uint64_t c10 = 0x000000001d9513d7;
+	uint64_t c11 = 0x000000008f352d95;
+	uint64_t c12 = 0x00000000af449247;
+	uint64_t c13 = 0x000000003db1ecdc;
+	uint64_t c14 = 0x0000000081256527;
+	uint64_t c15 = 0x00000000f1da05aa;
+	uint64_t c16 = 0x00000000ccaa009e;
+	uint64_t c17 = 0x00000000ae689191;
+	uint64_t c18 = 0x00000000ccaa009e;
+	uint64_t c19 = 0x00000000b8bc6765;
+	uint64_t c20 = 0x00000001f7011640;
+	uint64_t c21 = 0x00000001db710640;
+
+	a = _mm_set_epi64x(c1, c0);
+	crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);
+
+	a = _mm_set_epi64x(c3, c2);
+	crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);
+
+	crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
+			c9, c10, c11);
+	crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
+			c16, c17, 0, 0);
+	crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
+			_mm_cvtsi64_m64(c17));
+
+	crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
+			_mm_cvtsi64_m64(c19));
+	crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
+			_mm_cvtsi64_m64(c21));
+}
+
+static void
+crc16_load_init_constants(void)
+{
+	__m128i a;
+	/* fold constants */
+	uint64_t c0 = 0x0000000000009a19;
+	uint64_t c1 = 0x0000000000002df8;
+	uint64_t c2 = 0x00000000000068af;
+	uint64_t c3 = 0x000000000000b6c9;
+	uint64_t c4 = 0x000000000000c64f;
+	uint64_t c5 = 0x000000000000cd95;
+	uint64_t c6 = 0x000000000000d341;
+	uint64_t c7 = 0x000000000000b8f2;
+	uint64_t c8 = 0x0000000000000842;
+	uint64_t c9 = 0x000000000000b072;
+	uint64_t c10 = 0x00000000000047e3;
+	uint64_t c11 = 0x000000000000922d;
+	uint64_t c12 = 0x0000000000000e3a;
+	uint64_t c13 = 0x0000000000004d7a;
+	uint64_t c14 = 0x0000000000005b44;
+	uint64_t c15 = 0x0000000000007762;
+	uint64_t c16 = 0x00000000000081bf;
+	uint64_t c17 = 0x0000000000008e10;
+	uint64_t c18 = 0x00000000000081bf;
+	uint64_t c19 = 0x0000000000001cbb;
+	uint64_t c20 = 0x000000011c581910;
+	uint64_t c21 = 0x0000000000010810;
+
+	a = _mm_set_epi64x(c1, c0);
+	crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);
+
+	a = _mm_set_epi64x(c3, c2);
+	crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);
+
+	crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
+			c9, c10, c11);
+	crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
+			c16, c17, 0, 0);
+	crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
+			_mm_cvtsi64_m64(c17));
+
+	crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
+			_mm_cvtsi64_m64(c19));
+	crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
+			_mm_cvtsi64_m64(c21));
+}
+
+void
+rte_net_crc_avx512_init(void)
+{
+	crc32_load_init_constants();
+	crc16_load_init_constants();
+
+	/*
+	 * Reset the register as following calculation may
+	 * use other data types such as float, double, etc.
+	 */
+	_mm_empty();
+}
+
+uint32_t
+rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
+{
+	/* return 16-bit CRC value */
+	return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt);
+}
+
+uint32_t
+rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
+{
+	/* return 32-bit CRC value */
+	return ~crc32_eth_calc_vpclmulqdq(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth);
+}
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 83dccbfba..fcf9cc0ef 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -37,6 +37,12 @@ static rte_net_crc_handler handlers_scalar[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_handler,
 };
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+static rte_net_crc_handler handlers_avx512[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_avx512_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_avx512_handler,
+};
+#endif
 #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
 static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
@@ -132,6 +138,19 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
 		crc32_eth_lut);
 }
 
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+static uint8_t
+avx512_vpclmulqdq_cpu_supported(void)
+{
+	return rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512DQ) &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_PCLMULQDQ) &&
+		rte_cpu_get_flag_enabled(RTE_CPUFLAG_VPCLMULQDQ);
+}
+#endif
+
 #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
 static uint8_t
 sse42_pclmulqdq_cpu_supported(void)
@@ -153,6 +172,14 @@ rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
 #ifdef RTE_ARCH_X86_64
+	case RTE_NET_CRC_AVX512:
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+		if (avx512_vpclmulqdq_cpu_supported()) {
+			handlers = handlers_avx512;
+			break;
+		}
+#endif
+		/* fall-through */
 	case RTE_NET_CRC_SSE42:
 #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
 		if (sse42_pclmulqdq_cpu_supported()) {
@@ -206,6 +233,12 @@ RTE_INIT(rte_net_crc_init)
 		rte_net_crc_sse42_init();
 	}
 #endif
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+	if (avx512_vpclmulqdq_cpu_supported()) {
+		alg = RTE_NET_CRC_AVX512;
+		rte_net_crc_avx512_init();
+	}
+#endif
 #ifdef CC_ARM64_NEON_PMULL_SUPPORT
 	if (neon_pmull_cpu_supported()) {
 		alg = RTE_NET_CRC_NEON;
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 16e85ca97..72d3e10ff 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
  */
 
 #ifndef _RTE_NET_CRC_H_
@@ -23,6 +23,7 @@ enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
 	RTE_NET_CRC_NEON,
+	RTE_NET_CRC_AVX512,
 };
 
 /**
@@ -35,6 +36,7 @@ enum rte_net_crc_alg {
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
  *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
+ *   - RTE_NET_CRC_AVX512 (Use 512-bit AVX intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.12.3


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC
  2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
@ 2020-09-29 15:41   ` O'loingsigh, Mairtin
  0 siblings, 0 replies; 10+ messages in thread
From: O'loingsigh, Mairtin @ 2020-09-29 15:41 UTC (permalink / raw)
  To: Singh, Jasvinder, Richardson, Bruce, De Lara Guarch, Pablo
  Cc: dev, Ryan, Brendan, Coyle, David

Hi,

> -----Original Message-----
> From: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>
> Sent: Tuesday, September 29, 2020 4:13 PM
> To: Singh, Jasvinder <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>
> Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, David
> <david.coyle@intel.com>; O'loingsigh, Mairtin
> <mairtin.oloingsigh@intel.com>
> Subject: [PATCH v2 0/2] net: add CRC run-time checks and
> AVX512/VPCLMULQDQ based CRC
> 
> This patchset makes two significant enhancements to the CRC modules of
> the rte_net library:
> 
> 1) Adds run-time selection of the optimal architecture-specific CRC path.
>    Previously the selection was solely made at compile-time, meaning it
>    could only be built and run on the same generation of CPU. Adding
>    run-time selection ability means this can be used from distro packages
>    and/or DPDK can be compiled on an older CPU and run on a newer CPU.
> 2) Adds an optimized CRC implementation based on the AVX512 and
>    VPCLMULQDQ instruction sets.
> 
> For further details, please see the commit messages of the individual
> patches.
> 
> v2:
> * Added support for run-time selection of optimal architecture-specific
>   CRC, based on v1 review comment.
> * Added full working AVX512/VPCLMULQDQ support for CRC32-Ethernet and
>   CRC16-CCITT.
> 
> v1:
> * Initial version, with incomplete AVX512/VPCLMULQDQ support for
>   CRC32-Ethernet only.
> 
> Mairtin o Loingsigh (2):
>   net: add run-time architecture specific CRC selection
>   net: add support for AVX512/VPCLMULQDQ based CRC
> 
>  app/test/test_crc.c                               |  11 +-
>  config/x86/meson.build                            |   6 +-
>  doc/guides/rel_notes/release_20_11.rst            |   6 +
>  lib/librte_net/meson.build                        |  89 ++++-
>  lib/librte_net/net_crc.h                          |  45 +++
>  lib/librte_net/net_crc_avx512.c                   | 424 ++++++++++++++++++++++
>  lib/librte_net/{net_crc_neon.h => net_crc_neon.c} |  27 +-
>  lib/librte_net/{net_crc_sse.h => net_crc_sse.c}   |  34 +-
>  lib/librte_net/rte_net_crc.c                      | 100 +++--
>  lib/librte_net/rte_net_crc.h                      |   4 +-
>  10 files changed, 674 insertions(+), 72 deletions(-)
>  create mode 100644 lib/librte_net/net_crc.h
>  create mode 100644 lib/librte_net/net_crc_avx512.c
>  rename lib/librte_net/{net_crc_neon.h => net_crc_neon.c} (95%)
>  rename lib/librte_net/{net_crc_sse.h => net_crc_sse.c} (94%)
> 
> --
> 2.12.3


I encountered a problem submitting the v2 patchset, so I have submitted a v3 instead.

Regards,
Mairtin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC
  2020-09-10 12:27 ` Bruce Richardson
  2020-09-10 12:52   ` O'loingsigh, Mairtin
@ 2020-09-29 15:45   ` O'loingsigh, Mairtin
  1 sibling, 0 replies; 10+ messages in thread
From: O'loingsigh, Mairtin @ 2020-09-29 15:45 UTC (permalink / raw)
  To: Richardson, Bruce
  Cc: Singh, Jasvinder, dev, Ryan, Brendan, Coyle, David,
	De Lara Guarch, Pablo

Hi,

> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Thursday, September 10, 2020 1:28 PM
> To: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>
> Cc: Singh, Jasvinder <jasvinder.singh@intel.com>; dev@dpdk.org; Ryan,
> Brendan <brendan.ryan@intel.com>; Coyle, David <david.coyle@intel.com>;
> De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] net: add support for AVX512 when
> generating CRC
> 
> On Thu, Sep 10, 2020 at 01:01:11PM +0100, Mairtin o Loingsigh wrote:
> > This patch enables the generation of CRC using AVX512 instruction set
> > when available on the host platform.
> >
> > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
> > ---
> >
> > v1:
> > * Initial version, with AVX512 support for CRC32 Ethernet only
> > (requires further updates)
> >   * AVX512 support for CRC16-CCITT and final implementation of
> >     CRC32 Ethernet will be added in v2
> > ---
> >  doc/guides/rel_notes/release_20_11.rst |    4 +
> >  lib/librte_net/net_crc_avx.h           |  331 ++++++++++++++++++++++++++++++++
> >  lib/librte_net/rte_net_crc.c           |   23 ++-
> >  lib/librte_net/rte_net_crc.h           |    1 +
> >  4 files changed, 358 insertions(+), 1 deletions(-)
> >  create mode 100644 lib/librte_net/net_crc_avx.h
> >
> <snip>
> > --- a/lib/librte_net/rte_net_crc.c
> > +++ b/lib/librte_net/rte_net_crc.c
> > @@ -10,12 +10,18 @@
> >  #include <rte_common.h>
> >  #include <rte_net_crc.h>
> >
> > -#if defined(RTE_ARCH_X86_64) &&
> > defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
> > +#if defined(RTE_ARCH_X86_64) &&
> defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \
> > +	&& defined(RTE_MACHINE_CPUFLAG_AVX512F)
> > +#define X86_64_AVX512F_PCLMULQDQ     1
> > +#elif defined(RTE_ARCH_X86_64) &&
> > +defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
> 
> This all seems to be build-time selection of path. Can you perhaps investigate
> adding runtime selection instead, so that this can be used from distro
> packages, or DPDK compiled on older systems but used on newer.
> See also patchset: http://patches.dpdk.org/project/dpdk/list/?series=11831
> which is relevant to this too.
> 
> /Bruce

We have added a run-time check in v3 of the patch, which we have now submitted.

Mairtin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC
  2020-09-11  9:57 ` De Lara Guarch, Pablo
@ 2020-09-29 15:47   ` O'loingsigh, Mairtin
  0 siblings, 0 replies; 10+ messages in thread
From: O'loingsigh, Mairtin @ 2020-09-29 15:47 UTC (permalink / raw)
  To: De Lara Guarch, Pablo, Singh, Jasvinder; +Cc: dev, Ryan, Brendan, Coyle, David

Hi,

> -----Original Message-----
> From: De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> Sent: Friday, September 11, 2020 10:58 AM
> To: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>; Singh, Jasvinder
> <jasvinder.singh@intel.com>
> Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, David
> <david.coyle@intel.com>
> Subject: RE: [PATCH] net: add support for AVX512 when generating CRC
> 
> Hi Mairtin,
> 
> > -----Original Message-----
> > From: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>
> > Sent: Thursday, September 10, 2020 1:01 PM
> > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle,
> David
> > <david.coyle@intel.com>; De Lara Guarch, Pablo
> > <pablo.de.lara.guarch@intel.com>; O'loingsigh, Mairtin
> > <mairtin.oloingsigh@intel.com>
> > Subject: [PATCH] net: add support for AVX512 when generating CRC
> >
> > This patch enables the generation of CRC using AVX512 instruction set
> > when available on the host platform.
> >
> > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
> > ---
> >
> > v1:
> > * Initial version, with AVX512 support for CRC32 Ethernet only
> > (requires further
> > updates)
> >   * AVX512 support for CRC16-CCITT and final implementation of
> >     CRC32 Ethernet will be added in v2
> > ---
> >  doc/guides/rel_notes/release_20_11.rst |    4 +
> >  lib/librte_net/net_crc_avx.h           |  331
> ++++++++++++++++++++++++++++++++
> >  lib/librte_net/rte_net_crc.c           |   23 ++-
> >  lib/librte_net/rte_net_crc.h           |    1 +
> >  4 files changed, 358 insertions(+), 1 deletions(-)  create mode
> > 100644 lib/librte_net/net_crc_avx.h
> >
> > diff --git a/doc/guides/rel_notes/release_20_11.rst
> > b/doc/guides/rel_notes/release_20_11.rst
> > index df227a1..d6a84ca 100644
> > --- a/doc/guides/rel_notes/release_20_11.rst
> > +++ b/doc/guides/rel_notes/release_20_11.rst
> > @@ -55,6 +55,10 @@ New Features
> >       Also, make sure to start the actual text at the margin.
> >       =======================================================
> >
> > +* **Added support for AVX512 in rte_net CRC calculations.**
> > +
> > +  Added new CRC32 calculation code using AVX512 instruction set
> > + Added new CRC16-CCITT calculation code using AVX512 instruction set
> >
> >  Removed Items
> >  -------------
> > diff --git a/lib/librte_net/net_crc_avx.h
> > b/lib/librte_net/net_crc_avx.h new file mode 100644 index
> > 0000000..d9481d5
> > --- /dev/null
> > +++ b/lib/librte_net/net_crc_avx.h
> 
> ...
> 
> > +static __rte_always_inline uint32_t
> > +crc32_eth_calc_pclmulqdq(
> > +	const uint8_t *data,
> > +	uint32_t data_len,
> > +	uint32_t crc,
> > +	const struct crc_pclmulqdq512_ctx *params) {
> > +	__m256i b;
> > +	__m512i temp, k;
> > +	__m512i qw0 = _mm512_set1_epi64(0);
> > +	__m512i fold0;
> > +	uint32_t n;
> 
> This is loading 64 bytes of data, but it seems like only 16 are available, right?
> Should we use _mm_loadu_si128?
> 
> > +			fold0 = _mm512_xor_si512(fold0, temp);
> > +			goto reduction_128_64;
> > +		}
> > +
> > +		if (unlikely(data_len < 16)) {
> > +			/* 0 to 15 bytes */
> > +			uint8_t buffer[16] __rte_aligned(16);
> > +
> > +			memset(buffer, 0, sizeof(buffer));
> > +			memcpy(buffer, data, data_len);
> 
> I would use _mm_maskz_loadu_epi8, passing a mask register with ((1 <<
> data_len) - 1).
> 
> > +
> > +			fold0 = _mm512_load_si512((const __m128i
> *)buffer);
> > +			fold0 = _mm512_xor_si512(fold0, temp);
> > +			if (unlikely(data_len < 4)) {
> > +				fold0 = xmm_shift_left(fold0, 8 - data_len);
> > +				goto barret_reduction;
> > +			}
> > +			fold0 = xmm_shift_left(fold0, 16 - data_len);
> > +			goto reduction_128_64;
> > +		}
> > +		/* 17 to 31 bytes */
> > +		fold0 = _mm512_loadu_si512((const __m512i *)data);
> 
> Same here. Looks like you are loading too much data?
> 
> > +		fold0 = _mm512_xor_si512(fold0, temp);
> > +		n = 16;
> > +		k = params->rk1_rk2;
> > +		goto partial_bytes;
> > +	}
> 
> ...
> 
> > +
> > +		fold0 = _mm512_xor_si512(fold0, temp);
> > +		fold0 = _mm512_xor_si512(fold0, b);
> 
> You could use _mm512_ternarylogic_epi64 with 0x96 to do 2x XORs in one
> instruction.
> 
> > +	}
> > +
> > +	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
> > +reduction_128_64:
> > +	k = params->rk5_rk6;
> > +
> > +barret_reduction:
> > +	k = params->rk7_rk8;
> > +	n = crcr32_reduce_64_to_32(fold0, k);
> > +
> > +	return n;
> > +}
> > +
> > +

The latest version of this patch (v3) reworks a lot of this code and addresses the issues noted above.

Mairtin

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-09-29 15:47 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh
2020-09-10 12:27 ` Bruce Richardson
2020-09-10 12:52   ` O'loingsigh, Mairtin
2020-09-29 15:45   ` O'loingsigh, Mairtin
2020-09-11  9:57 ` De Lara Guarch, Pablo
2020-09-29 15:47   ` O'loingsigh, Mairtin
2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
2020-09-29 15:41   ` O'loingsigh, Mairtin
2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection Mairtin o Loingsigh
2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).