* [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC @ 2020-09-10 12:01 Mairtin o Loingsigh 2020-09-10 12:27 ` Bruce Richardson ` (4 more replies) 0 siblings, 5 replies; 10+ messages in thread From: Mairtin o Loingsigh @ 2020-09-10 12:01 UTC (permalink / raw) To: jasvinder.singh Cc: dev, brendan.ryan, David.Coyle, pablo.de.lara.guarch, Mairtin o Loingsigh This patch enables the generation of CRC using AVX512 instruction set when available on the host platform. Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> --- v1: * Initial version, with AVX512 support for CRC32 Ethernet only (requires further updates) * AVX512 support for CRC16-CCITT and final implementation of CRC32 Ethernet will be added in v2 --- doc/guides/rel_notes/release_20_11.rst | 4 + lib/librte_net/net_crc_avx.h | 331 ++++++++++++++++++++++++++++++++ lib/librte_net/rte_net_crc.c | 23 ++- lib/librte_net/rte_net_crc.h | 1 + 4 files changed, 358 insertions(+), 1 deletions(-) create mode 100644 lib/librte_net/net_crc_avx.h diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst index df227a1..d6a84ca 100644 --- a/doc/guides/rel_notes/release_20_11.rst +++ b/doc/guides/rel_notes/release_20_11.rst @@ -55,6 +55,10 @@ New Features Also, make sure to start the actual text at the margin. 
======================================================= +* **Added support for AVX512 in rte_net CRC calculations.** + + Added new CRC32 calculation code using AVX512 instruction set + Added new CRC16-CCITT calculation code using AVX512 instruction set Removed Items ------------- diff --git a/lib/librte_net/net_crc_avx.h b/lib/librte_net/net_crc_avx.h new file mode 100644 index 0000000..d9481d5 --- /dev/null +++ b/lib/librte_net/net_crc_avx.h @@ -0,0 +1,331 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#ifndef _RTE_NET_CRC_AVX_H_ +#define _RTE_NET_CRC_AVX_H_ + +#include <rte_branch_prediction.h> + +#include <rte_vect.h> +#include <immintrin.h> +#include <x86intrin.h> +#include <cpuid.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** PCLMULQDQ CRC computation context structure */ +struct crc_pclmulqdq512_ctx { + __m512i rk1_rk2; + __m512i rk3_rk4; + __m512i rk5_rk6; + __m512i rk7_rk8; +}; + +static struct crc_pclmulqdq512_ctx crc32_eth_pclmulqdq __rte_aligned(16); + +/** + * @brief Performs one folding round + * + * Logically function operates as follows: + * DATA = READ_NEXT_64BYTES(); + * F1 = LSB8(FOLD) + * F2 = MSB8(FOLD) + * T1 = CLMUL(F1, RK1) + * T2 = CLMUL(F2, RK2) + * FOLD = XOR(T1, T2, DATA) + * + * @param data_block + * 64 byte data block + * @param precomp + * Precomputed rk1 constant + * @param fold + * Current16 byte folded data + * + * @return + * New 16 byte folded data + */ +static __rte_always_inline __m512i +crcr32_folding_round(__m512i data_block, + __m512i precomp, + __m512i fold) +{ + __m512i tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01); + __m512i tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10); + + return _mm512_xor_si512(tmp1, _mm512_xor_si512(data_block, tmp0)); +} + +/** + * Performs reduction from 128 bits to 64 bits + * + * @param data128 + * 128 bits data to be reduced + * @param precomp + * precomputed constants rk5, rk6 + * + * @return + * 64 bits reduced data + */ + 
+static __rte_always_inline __m128i +crcr32_reduce_128_to_64(__m128i data128, __m128i precomp) +{ + __m128i tmp0, tmp1, tmp2; + + /* 64b fold */ + tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00); + tmp1 = _mm_srli_si128(data128, 8); + tmp0 = _mm_xor_si128(tmp0, tmp1); + + /* 32b fold */ + tmp2 = _mm_slli_si128(tmp0, 4); + tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10); + + return _mm_xor_si128(tmp1, tmp0); +} + +/** + * Performs Barret's reduction from 64 bits to 32 bits + * + * @param data64 + * 64 bits data to be reduced + * @param precomp + * rk7 precomputed constant + * + * @return + * reduced 32 bits data + */ + +static __rte_always_inline uint32_t +crcr32_reduce_64_to_32(__m512i data64, __m512i precomp) +{ + static const uint32_t mask1[4] __rte_aligned(64) = { + 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 + }; + + static const uint32_t mask2[4] __rte_aligned(64) = { + 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff + }; + __m512i tmp0, tmp1, tmp2; + + tmp0 = _mm512_and_si512(data64, _mm512_load_si512( + (const __m512i *)mask2)); + + tmp1 = _mm512_clmulepi64_epi128(tmp0, precomp, 0x00); + tmp1 = _mm512_xor_si512(tmp1, tmp0); + tmp1 = _mm512_and_si512(tmp1, _mm512_load_si512( + (const __m128i *)mask1)); + + tmp2 = _mm512_clmulepi64_epi128(tmp1, precomp, 0x10); + tmp2 = _mm512_xor_si512(tmp2, tmp1); + tmp2 = _mm512_xor_si512(tmp2, tmp0); + + return 0; +} + +static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(64) = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/** + * Shifts left 128 bit register by specified number of bytes + * + * @param reg + * 128 bit value + * @param num + * number of bytes to shift left reg by (0-16) + * + * @return + * reg << (num * 8) + */ + +static __rte_always_inline 
__m512i +xmm_shift_left(__m512i reg, const unsigned int num) +{ + const __m512i *p = (const __m512i *)(crc_xmm_shift_tab + 16 - num); + + /* TODO: Check unaligned load*/ + return _mm512_shuffle_epi8(reg, _mm512_load_si512(p)); +} + +static __rte_always_inline uint32_t +crc32_eth_calc_pclmulqdq( + const uint8_t *data, + uint32_t data_len, + uint32_t crc, + const struct crc_pclmulqdq512_ctx *params) +{ + __m256i b; + __m512i temp, k; + __m512i qw0 = _mm512_set1_epi64(0); + __m512i fold0; + uint32_t n; + + /* Get CRC init value */ + b = _mm256_insert_epi32(_mm256_setzero_si256(), crc, 0); + temp = _mm512_inserti32x8(_mm512_setzero_si512(), b, 0); + + /* align data to 16B*/ + if (unlikely(data_len < 64)) { + if (unlikely(data_len == 16)) { + /* 16 bytes */ + /* TODO: Unaligned load not working */ + fold0 = _mm512_load_epi64((const __m512i *)data); + fold0 = _mm512_xor_si512(fold0, temp); + goto reduction_128_64; + } + + if (unlikely(data_len < 16)) { + /* 0 to 15 bytes */ + uint8_t buffer[16] __rte_aligned(16); + + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, data, data_len); + + fold0 = _mm512_load_si512((const __m128i *)buffer); + fold0 = _mm512_xor_si512(fold0, temp); + if (unlikely(data_len < 4)) { + fold0 = xmm_shift_left(fold0, 8 - data_len); + goto barret_reduction; + } + fold0 = xmm_shift_left(fold0, 16 - data_len); + goto reduction_128_64; + } + /* 17 to 31 bytes */ + fold0 = _mm512_loadu_si512((const __m512i *)data); + fold0 = _mm512_xor_si512(fold0, temp); + n = 16; + k = params->rk1_rk2; + goto partial_bytes; + } + + /*Loop of folds*/ + /** At least 32 bytes in the buffer */ + /** Apply CRC initial value */ + fold0 = _mm512_loadu_si512((const __m512i *)data); + fold0 = _mm512_xor_si512(fold0, temp); + + /** Main folding loop - the last 32 bytes is processed separately */ + k = params->rk1_rk2; + for (n = 64; (n + 64) <= data_len; n += 64) { + qw0 = _mm512_loadu_si512((const __m512i *)&data[n]); + fold0 = crcr32_folding_round(qw0, k, fold0); + } + + 
/* 256 to 128 fold */ + /* Check this */ + k = params->rk3_rk4; + fold0 = crcr32_folding_round(temp, k, fold0); + n += 64; + + /* Remainder */ +partial_bytes: + if (likely(n < data_len)) { + + const uint32_t mask3[4] __rte_aligned(16) = { + 0x80808080, 0x80808080, 0x80808080, 0x80808080 + }; + + const uint8_t shf_table[32] __rte_aligned(16) = { + 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; + + __m128i last16; + __m512i a, b; + + last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]); + + RTE_SET_USED(last16); + + temp = _mm512_loadu_si512((const __m512i *) + &shf_table[data_len & 15]); + a = _mm512_shuffle_epi8(fold0, temp); + + temp = _mm512_xor_si512(temp, + _mm512_load_si512((const __m512i *)mask3)); + b = _mm512_shuffle_epi8(fold0, temp); + + /* k = rk1 & rk2 */ + temp = _mm512_clmulepi64_epi128(a, k, 0x01); + fold0 = _mm512_clmulepi64_epi128(a, k, 0x10); + + fold0 = _mm512_xor_si512(fold0, temp); + fold0 = _mm512_xor_si512(fold0, b); + } + + /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */ +reduction_128_64: + k = params->rk5_rk6; + +barret_reduction: + k = params->rk7_rk8; + n = crcr32_reduce_64_to_32(fold0, k); + + return n; +} + + +static inline void +rte_net_crc_avx512_init(void) +{ + __m128i a; + uint64_t k1, k2, k3, k4, k5, k6; + uint64_t p = 0, q = 0; + + /** Initialize CRC32 data */ + /* 256 fold constants*/ + k1 = 0xe95c1271LLU; + k2 = 0xce3371cbLLU; + + /*256 - 128 fold constants */ + k3 = 0x910eeec1LLU; + k4 = 0x33fff533LLU; + + k5 = 0xccaa009eLLU; + k6 = 0x163cd6124LLU; + q = 0x1f7011640LLU; + p = 0x1db710641LLU; + + /** Save the params in context structure */ + a = _mm_set_epi64x(k2, k1); + crc32_eth_pclmulqdq.rk1_rk2 = _mm512_broadcast_i32x4(a); + crc32_eth_pclmulqdq.rk3_rk4 = _mm512_setr_epi64( + k3, k4, 0, 0, 0, 0, 0, 0); + crc32_eth_pclmulqdq.rk5_rk6 = 
_mm512_setr_epi64( + k5, k6, 0, 0, 0, 0, 0, 0); + crc32_eth_pclmulqdq.rk7_rk8 = _mm512_setr_epi64( + q, p, 0, 0, 0, 0, 0, 0); + /** + * Reset the register as following calculation may + * use other data types such as float, double, etc. + */ + _mm_empty(); + +} + +static inline uint32_t +rte_crc32_eth_avx512_handler(const uint8_t *data, + uint32_t data_len) +{ + return ~crc32_eth_calc_pclmulqdq(data, + data_len, + 0xffffffffUL, + &crc32_eth_pclmulqdq); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_NET_CRC_AVX_H_ */ diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c index 9fd4794..b2b2bc1 100644 --- a/lib/librte_net/rte_net_crc.c +++ b/lib/librte_net/rte_net_crc.c @@ -10,12 +10,18 @@ #include <rte_common.h> #include <rte_net_crc.h> -#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) +#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \ + && defined(RTE_MACHINE_CPUFLAG_AVX512F) +#define X86_64_AVX512F_PCLMULQDQ 1 +#elif defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) #define X86_64_SSE42_PCLMULQDQ 1 #elif defined(RTE_ARCH_ARM64) && defined(RTE_MACHINE_CPUFLAG_PMULL) #define ARM64_NEON_PMULL 1 #endif +#ifdef X86_64_AVX512F_PCLMULQDQ +#include <net_crc_avx.h> +#endif #ifdef X86_64_SSE42_PCLMULQDQ #include <net_crc_sse.h> #elif defined ARM64_NEON_PMULL @@ -48,6 +54,12 @@ [RTE_NET_CRC32_ETH] = rte_crc32_eth_handler, }; +#ifdef X86_64_AVX512F_PCLMULQDQ +static rte_net_crc_handler handlers_avx512[] = { + [RTE_NET_CRC32_ETH] = rte_crc32_eth_avx512_handler, +}; +#endif + #ifdef X86_64_SSE42_PCLMULQDQ static rte_net_crc_handler handlers_sse42[] = { [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler, @@ -157,6 +169,11 @@ handlers = handlers_neon; break; } +#elif defined X86_64_AVX512F_PCLMULQDQ + /* fall-through */ + case RTE_NET_CRC_AVX512: + handlers = handlers_avx512; + break; #endif /* fall-through */ case RTE_NET_CRC_SCALAR: @@ -197,6 +214,10 @@ rte_net_crc_neon_init(); } #endif 
+#ifdef X86_64_AVX512F_PCLMULQDQ + alg = RTE_NET_CRC_AVX512; + rte_net_crc_avx512_init(); +#endif rte_net_crc_set_alg(alg); } diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h index 16e85ca..a7d2ed0 100644 --- a/lib/librte_net/rte_net_crc.h +++ b/lib/librte_net/rte_net_crc.h @@ -23,6 +23,7 @@ enum rte_net_crc_alg { RTE_NET_CRC_SCALAR = 0, RTE_NET_CRC_SSE42, RTE_NET_CRC_NEON, + RTE_NET_CRC_AVX512, }; /** -- 1.7.0.7 ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC 2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh @ 2020-09-10 12:27 ` Bruce Richardson 2020-09-10 12:52 ` O'loingsigh, Mairtin 2020-09-29 15:45 ` O'loingsigh, Mairtin 2020-09-11 9:57 ` De Lara Guarch, Pablo ` (3 subsequent siblings) 4 siblings, 2 replies; 10+ messages in thread From: Bruce Richardson @ 2020-09-10 12:27 UTC (permalink / raw) To: Mairtin o Loingsigh Cc: jasvinder.singh, dev, brendan.ryan, David.Coyle, pablo.de.lara.guarch On Thu, Sep 10, 2020 at 01:01:11PM +0100, Mairtin o Loingsigh wrote: > This patch enables the generation of CRC using AVX512 instruction > set when available on the host platform. > > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> > --- > > v1: > * Initial version, with AVX512 support for CRC32 Ethernet only > (requires further updates) > * AVX512 support for CRC16-CCITT and final implementation of > CRC32 Ethernet will be added in v2 > --- > doc/guides/rel_notes/release_20_11.rst | 4 + > lib/librte_net/net_crc_avx.h | 331 ++++++++++++++++++++++++++++++++ > lib/librte_net/rte_net_crc.c | 23 ++- > lib/librte_net/rte_net_crc.h | 1 + > 4 files changed, 358 insertions(+), 1 deletions(-) > create mode 100644 lib/librte_net/net_crc_avx.h > <snip> > --- a/lib/librte_net/rte_net_crc.c > +++ b/lib/librte_net/rte_net_crc.c > @@ -10,12 +10,18 @@ > #include <rte_common.h> > #include <rte_net_crc.h> > > -#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) > +#if defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \ > + && defined(RTE_MACHINE_CPUFLAG_AVX512F) > +#define X86_64_AVX512F_PCLMULQDQ 1 > +#elif defined(RTE_ARCH_X86_64) && defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) This all seems to be build-time selection of path. 
Can you perhaps investigate adding runtime selection instead, so that this can be used from distro packages, or DPDK compiled on older systems but used on newer. See also patchset: http://patches.dpdk.org/project/dpdk/list/?series=11831 which is relevant to this too. /Bruce ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC 2020-09-10 12:27 ` Bruce Richardson @ 2020-09-10 12:52 ` O'loingsigh, Mairtin 2020-09-29 15:45 ` O'loingsigh, Mairtin 1 sibling, 0 replies; 10+ messages in thread From: O'loingsigh, Mairtin @ 2020-09-10 12:52 UTC (permalink / raw) To: Richardson, Bruce Cc: Singh, Jasvinder, dev, Ryan, Brendan, Coyle, David, De Lara Guarch, Pablo > -----Original Message----- > From: Bruce Richardson <bruce.richardson@intel.com> > Sent: Thursday, September 10, 2020 1:28 PM > To: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com> > Cc: Singh, Jasvinder <jasvinder.singh@intel.com>; dev@dpdk.org; Ryan, > Brendan <brendan.ryan@intel.com>; Coyle, David <david.coyle@intel.com>; > De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com> > Subject: Re: [dpdk-dev] [PATCH] net: add support for AVX512 when > generating CRC > > On Thu, Sep 10, 2020 at 01:01:11PM +0100, Mairtin o Loingsigh wrote: > > This patch enables the generation of CRC using AVX512 instruction set > > when available on the host platform. 
> > > > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> > > --- > > > > v1: > > * Initial version, with AVX512 support for CRC32 Ethernet only > > (requires further updates) > > * AVX512 support for CRC16-CCITT and final implementation of > > CRC32 Ethernet will be added in v2 > > --- > > doc/guides/rel_notes/release_20_11.rst | 4 + > > lib/librte_net/net_crc_avx.h | 331 > ++++++++++++++++++++++++++++++++ > > lib/librte_net/rte_net_crc.c | 23 ++- > > lib/librte_net/rte_net_crc.h | 1 + > > 4 files changed, 358 insertions(+), 1 deletions(-) create mode > > 100644 lib/librte_net/net_crc_avx.h > > > <snip> > > --- a/lib/librte_net/rte_net_crc.c > > +++ b/lib/librte_net/rte_net_crc.c > > @@ -10,12 +10,18 @@ > > #include <rte_common.h> > > #include <rte_net_crc.h> > > > > -#if defined(RTE_ARCH_X86_64) && > > defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) > > +#if defined(RTE_ARCH_X86_64) && > defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \ > > + && defined(RTE_MACHINE_CPUFLAG_AVX512F) > > +#define X86_64_AVX512F_PCLMULQDQ 1 > > +#elif defined(RTE_ARCH_X86_64) && > > +defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) > > This all seems to be build-time selection of path. Can you perhaps investigate > adding runtime selection instead, so that this can be used from distro > packages, or DPDK compiled on older systems but used on newer. > See also patchset: http://patches.dpdk.org/project/dpdk/list/?series=11831 > which is relevant to this too. > > /Bruce Sure. I will look at options for run time selection of intrinsic path ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC 2020-09-10 12:27 ` Bruce Richardson 2020-09-10 12:52 ` O'loingsigh, Mairtin @ 2020-09-29 15:45 ` O'loingsigh, Mairtin 1 sibling, 0 replies; 10+ messages in thread From: O'loingsigh, Mairtin @ 2020-09-29 15:45 UTC (permalink / raw) To: Richardson, Bruce Cc: Singh, Jasvinder, dev, Ryan, Brendan, Coyle, David, De Lara Guarch, Pablo Hi, > -----Original Message----- > From: Bruce Richardson <bruce.richardson@intel.com> > Sent: Thursday, September 10, 2020 1:28 PM > To: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com> > Cc: Singh, Jasvinder <jasvinder.singh@intel.com>; dev@dpdk.org; Ryan, > Brendan <brendan.ryan@intel.com>; Coyle, David <david.coyle@intel.com>; > De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com> > Subject: Re: [dpdk-dev] [PATCH] net: add support for AVX512 when > generating CRC > > On Thu, Sep 10, 2020 at 01:01:11PM +0100, Mairtin o Loingsigh wrote: > > This patch enables the generation of CRC using AVX512 instruction set > > when available on the host platform. 
> > > > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> > > --- > > > > v1: > > * Initial version, with AVX512 support for CRC32 Ethernet only > > (requires further updates) > > * AVX512 support for CRC16-CCITT and final implementation of > > CRC32 Ethernet will be added in v2 > > --- > > doc/guides/rel_notes/release_20_11.rst | 4 + > > lib/librte_net/net_crc_avx.h | 331 > ++++++++++++++++++++++++++++++++ > > lib/librte_net/rte_net_crc.c | 23 ++- > > lib/librte_net/rte_net_crc.h | 1 + > > 4 files changed, 358 insertions(+), 1 deletions(-) create mode > > 100644 lib/librte_net/net_crc_avx.h > > > <snip> > > --- a/lib/librte_net/rte_net_crc.c > > +++ b/lib/librte_net/rte_net_crc.c > > @@ -10,12 +10,18 @@ > > #include <rte_common.h> > > #include <rte_net_crc.h> > > > > -#if defined(RTE_ARCH_X86_64) && > > defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) > > +#if defined(RTE_ARCH_X86_64) && > defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) \ > > + && defined(RTE_MACHINE_CPUFLAG_AVX512F) > > +#define X86_64_AVX512F_PCLMULQDQ 1 > > +#elif defined(RTE_ARCH_X86_64) && > > +defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ) > > This all seems to be build-time selection of path. Can you perhaps investigate > adding runtime selection instead, so that this can be used from distro > packages, or DPDK compiled on older systems but used on newer. > See also patchset: http://patches.dpdk.org/project/dpdk/list/?series=11831 > which is relevant to this too. > > /Bruce We have added runtime check for v3 of patch which we have submitted Mairtin ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC 2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh 2020-09-10 12:27 ` Bruce Richardson @ 2020-09-11 9:57 ` De Lara Guarch, Pablo 2020-09-29 15:47 ` O'loingsigh, Mairtin 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh ` (2 subsequent siblings) 4 siblings, 1 reply; 10+ messages in thread From: De Lara Guarch, Pablo @ 2020-09-11 9:57 UTC (permalink / raw) To: O'loingsigh, Mairtin, Singh, Jasvinder Cc: dev, Ryan, Brendan, Coyle, David Hi Mairtin, > -----Original Message----- > From: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com> > Sent: Thursday, September 10, 2020 1:01 PM > To: Singh, Jasvinder <jasvinder.singh@intel.com> > Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, David > <david.coyle@intel.com>; De Lara Guarch, Pablo > <pablo.de.lara.guarch@intel.com>; O'loingsigh, Mairtin > <mairtin.oloingsigh@intel.com> > Subject: [PATCH] net: add support for AVX512 when generating CRC > > This patch enables the generation of CRC using AVX512 instruction set when > available on the host platform. 
> > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> > --- > > v1: > * Initial version, with AVX512 support for CRC32 Ethernet only (requires further > updates) > * AVX512 support for CRC16-CCITT and final implementation of > CRC32 Ethernet will be added in v2 > --- > doc/guides/rel_notes/release_20_11.rst | 4 + > lib/librte_net/net_crc_avx.h | 331 ++++++++++++++++++++++++++++++++ > lib/librte_net/rte_net_crc.c | 23 ++- > lib/librte_net/rte_net_crc.h | 1 + > 4 files changed, 358 insertions(+), 1 deletions(-) create mode 100644 > lib/librte_net/net_crc_avx.h > > diff --git a/doc/guides/rel_notes/release_20_11.rst > b/doc/guides/rel_notes/release_20_11.rst > index df227a1..d6a84ca 100644 > --- a/doc/guides/rel_notes/release_20_11.rst > +++ b/doc/guides/rel_notes/release_20_11.rst > @@ -55,6 +55,10 @@ New Features > Also, make sure to start the actual text at the margin. > ======================================================= > > +* **Added support for AVX512 in rte_net CRC calculations.** > + > + Added new CRC32 calculation code using AVX512 instruction set Added > + new CRC16-CCITT calculation code using AVX512 instruction set > > Removed Items > ------------- > diff --git a/lib/librte_net/net_crc_avx.h b/lib/librte_net/net_crc_avx.h new file > mode 100644 index 0000000..d9481d5 > --- /dev/null > +++ b/lib/librte_net/net_crc_avx.h ... > +static __rte_always_inline uint32_t > +crc32_eth_calc_pclmulqdq( > + const uint8_t *data, > + uint32_t data_len, > + uint32_t crc, > + const struct crc_pclmulqdq512_ctx *params) { > + __m256i b; > + __m512i temp, k; > + __m512i qw0 = _mm512_set1_epi64(0); > + __m512i fold0; > + uint32_t n; This is loading 64 bytes of data, but if seems like only 16 are available, right? Should we use _mm_loadu_si128? 
> + fold0 = _mm512_xor_si512(fold0, temp); > + goto reduction_128_64; > + } > + > + if (unlikely(data_len < 16)) { > + /* 0 to 15 bytes */ > + uint8_t buffer[16] __rte_aligned(16); > + > + memset(buffer, 0, sizeof(buffer)); > + memcpy(buffer, data, data_len); I would use _mm_maskz_loadu_epi8, passing a mask register with ((1 << data_len) - 1). > + > + fold0 = _mm512_load_si512((const __m128i *)buffer); > + fold0 = _mm512_xor_si512(fold0, temp); > + if (unlikely(data_len < 4)) { > + fold0 = xmm_shift_left(fold0, 8 - data_len); > + goto barret_reduction; > + } > + fold0 = xmm_shift_left(fold0, 16 - data_len); > + goto reduction_128_64; > + } > + /* 17 to 31 bytes */ > + fold0 = _mm512_loadu_si512((const __m512i *)data); Same here. Looks like you are loading too much data? > + fold0 = _mm512_xor_si512(fold0, temp); > + n = 16; > + k = params->rk1_rk2; > + goto partial_bytes; > + } ... > + > + fold0 = _mm512_xor_si512(fold0, temp); > + fold0 = _mm512_xor_si512(fold0, b); You could use _mm512_ternarylogic_epi64 with 0x96 as to do 2x XORs in one instruction. > + } > + > + /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */ > +reduction_128_64: > + k = params->rk5_rk6; > + > +barret_reduction: > + k = params->rk7_rk8; > + n = crcr32_reduce_64_to_32(fold0, k); > + > + return n; > +} > + > + ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC 2020-09-11 9:57 ` De Lara Guarch, Pablo @ 2020-09-29 15:47 ` O'loingsigh, Mairtin 0 siblings, 0 replies; 10+ messages in thread From: O'loingsigh, Mairtin @ 2020-09-29 15:47 UTC (permalink / raw) To: De Lara Guarch, Pablo, Singh, Jasvinder; +Cc: dev, Ryan, Brendan, Coyle, David Hi, > -----Original Message----- > From: De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com> > Sent: Friday, September 11, 2020 10:58 AM > To: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com>; Singh, Jasvinder > <jasvinder.singh@intel.com> > Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, David > <david.coyle@intel.com> > Subject: RE: [PATCH] net: add support for AVX512 when generating CRC > > Hi Mairtin, > > > -----Original Message----- > > From: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com> > > Sent: Thursday, September 10, 2020 1:01 PM > > To: Singh, Jasvinder <jasvinder.singh@intel.com> > > Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, > David > > <david.coyle@intel.com>; De Lara Guarch, Pablo > > <pablo.de.lara.guarch@intel.com>; O'loingsigh, Mairtin > > <mairtin.oloingsigh@intel.com> > > Subject: [PATCH] net: add support for AVX512 when generating CRC > > > > This patch enables the generation of CRC using AVX512 instruction set > > when available on the host platform. 
> > > > Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> > > --- > > > > v1: > > * Initial version, with AVX512 support for CRC32 Ethernet only > > (requires further > > updates) > > * AVX512 support for CRC16-CCITT and final implementation of > > CRC32 Ethernet will be added in v2 > > --- > > doc/guides/rel_notes/release_20_11.rst | 4 + > > lib/librte_net/net_crc_avx.h | 331 > ++++++++++++++++++++++++++++++++ > > lib/librte_net/rte_net_crc.c | 23 ++- > > lib/librte_net/rte_net_crc.h | 1 + > > 4 files changed, 358 insertions(+), 1 deletions(-) create mode > > 100644 lib/librte_net/net_crc_avx.h > > > > diff --git a/doc/guides/rel_notes/release_20_11.rst > > b/doc/guides/rel_notes/release_20_11.rst > > index df227a1..d6a84ca 100644 > > --- a/doc/guides/rel_notes/release_20_11.rst > > +++ b/doc/guides/rel_notes/release_20_11.rst > > @@ -55,6 +55,10 @@ New Features > > Also, make sure to start the actual text at the margin. > > ======================================================= > > > > +* **Added support for AVX512 in rte_net CRC calculations.** > > + > > + Added new CRC32 calculation code using AVX512 instruction set > > + Added new CRC16-CCITT calculation code using AVX512 instruction set > > > > Removed Items > > ------------- > > diff --git a/lib/librte_net/net_crc_avx.h > > b/lib/librte_net/net_crc_avx.h new file mode 100644 index > > 0000000..d9481d5 > > --- /dev/null > > +++ b/lib/librte_net/net_crc_avx.h > > ... > > > +static __rte_always_inline uint32_t > > +crc32_eth_calc_pclmulqdq( > > + const uint8_t *data, > > + uint32_t data_len, > > + uint32_t crc, > > + const struct crc_pclmulqdq512_ctx *params) { > > + __m256i b; > > + __m512i temp, k; > > + __m512i qw0 = _mm512_set1_epi64(0); > > + __m512i fold0; > > + uint32_t n; > > This is loading 64 bytes of data, but if seems like only 16 are available, right? > Should we use _mm_loadu_si128? 
> > > + fold0 = _mm512_xor_si512(fold0, temp); > > + goto reduction_128_64; > > + } > > + > > + if (unlikely(data_len < 16)) { > > + /* 0 to 15 bytes */ > > + uint8_t buffer[16] __rte_aligned(16); > > + > > + memset(buffer, 0, sizeof(buffer)); > > + memcpy(buffer, data, data_len); > > I would use _mm_maskz_loadu_epi8, passing a mask register with ((1 << > data_len) - 1). > > > + > > + fold0 = _mm512_load_si512((const __m128i > *)buffer); > > + fold0 = _mm512_xor_si512(fold0, temp); > > + if (unlikely(data_len < 4)) { > > + fold0 = xmm_shift_left(fold0, 8 - data_len); > > + goto barret_reduction; > > + } > > + fold0 = xmm_shift_left(fold0, 16 - data_len); > > + goto reduction_128_64; > > + } > > + /* 17 to 31 bytes */ > > + fold0 = _mm512_loadu_si512((const __m512i *)data); > > Same here. Looks like you are loading too much data? > > > + fold0 = _mm512_xor_si512(fold0, temp); > > + n = 16; > > + k = params->rk1_rk2; > > + goto partial_bytes; > > + } > > ... > > > + > > + fold0 = _mm512_xor_si512(fold0, temp); > > + fold0 = _mm512_xor_si512(fold0, b); > > You could use _mm512_ternarylogic_epi64 with 0x96 as to do 2x XORs in one > instruction. > > > + } > > + > > + /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */ > > +reduction_128_64: > > + k = params->rk5_rk6; > > + > > +barret_reduction: > > + k = params->rk7_rk8; > > + n = crcr32_reduce_64_to_32(fold0, k); > > + > > + return n; > > +} > > + > > + The latest version of this patch (v3) reworks a lot of this code and address the issues noted above Mairtin ^ permalink raw reply [flat|nested] 10+ messages in thread
* [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC 2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh 2020-09-10 12:27 ` Bruce Richardson 2020-09-11 9:57 ` De Lara Guarch, Pablo @ 2020-09-29 15:12 ` Mairtin o Loingsigh 2020-09-29 15:41 ` O'loingsigh, Mairtin 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection Mairtin o Loingsigh 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh 4 siblings, 1 reply; 10+ messages in thread From: Mairtin o Loingsigh @ 2020-09-29 15:12 UTC (permalink / raw) To: jasvinder.singh, bruce.richardson, pablo.de.lara.guarch Cc: dev, brendan.ryan, david.coyle, Mairtin o Loingsigh This patchset makes two significant enhancements to the CRC modules of the rte_net library: 1) Adds run-time selection of the optimal architecture-specific CRC path. Previously the selection was solely made at compile-time, meaning it could only be built and run on the same generation of CPU. Adding run-time selection ability means this can be used from distro packages and/or DPDK can be compiled on an older CPU and run on a newer CPU. 2) Adds an optimized CRC implementation based on the AVX512 and VPCLMULQDQ instruction sets. For further details, please see the commit messages of the individual patches. v2: * Added support for run-time selection of optimal architecture-specific CRC, based on v1 review comment. * Added full working AVX512/VPCLMULQDQ support for CRC32-Ethernet and CRC16-CCITT. v1: * Initial version, with incomplete AVX512/VPCLMULQDQ support for CRC32-Ethernet only. 
Mairtin o Loingsigh (2): net: add run-time architecture specific CRC selection net: add support for AVX512/VPCLMULQDQ based CRC app/test/test_crc.c | 11 +- config/x86/meson.build | 6 +- doc/guides/rel_notes/release_20_11.rst | 6 + lib/librte_net/meson.build | 89 ++++- lib/librte_net/net_crc.h | 45 +++ lib/librte_net/net_crc_avx512.c | 424 ++++++++++++++++++++++ lib/librte_net/{net_crc_neon.h => net_crc_neon.c} | 27 +- lib/librte_net/{net_crc_sse.h => net_crc_sse.c} | 34 +- lib/librte_net/rte_net_crc.c | 100 +++-- lib/librte_net/rte_net_crc.h | 4 +- 10 files changed, 674 insertions(+), 72 deletions(-) create mode 100644 lib/librte_net/net_crc.h create mode 100644 lib/librte_net/net_crc_avx512.c rename lib/librte_net/{net_crc_neon.h => net_crc_neon.c} (95%) rename lib/librte_net/{net_crc_sse.h => net_crc_sse.c} (94%) -- 2.12.3 ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh @ 2020-09-29 15:41 ` O'loingsigh, Mairtin 0 siblings, 0 replies; 10+ messages in thread From: O'loingsigh, Mairtin @ 2020-09-29 15:41 UTC (permalink / raw) To: Singh, Jasvinder, Richardson, Bruce, De Lara Guarch, Pablo Cc: dev, Ryan, Brendan, Coyle, David Hi, > -----Original Message----- > From: O'loingsigh, Mairtin <mairtin.oloingsigh@intel.com> > Sent: Tuesday, September 29, 2020 4:13 PM > To: Singh, Jasvinder <jasvinder.singh@intel.com>; Richardson, Bruce > <bruce.richardson@intel.com>; De Lara Guarch, Pablo > <pablo.de.lara.guarch@intel.com> > Cc: dev@dpdk.org; Ryan, Brendan <brendan.ryan@intel.com>; Coyle, David > <david.coyle@intel.com>; O'loingsigh, Mairtin > <mairtin.oloingsigh@intel.com> > Subject: [PATCH v2 0/2] net: add CRC run-time checks and > AVX512/VPCLMULQDQ based CRC > > This patchset makes two significant enhancements to the CRC modules of > the rte_net library: > > 1) Adds run-time selection of the optimal architecture-specific CRC path. > Previously the selection was solely made at compile-time, meaning it > could only be built and run on the same generation of CPU. Adding > run-time selection ability means this can be used from distro packages > and/or DPDK can be compiled on an older CPU and run on a newer CPU. > 2) Adds an optimized CRC implementation based on the AVX512 and > VPCLMULQDQ instruction sets. > > For further details, please see the commit messages of the individual > patches. > > v2: > * Added support for run-time selection of optimal architecture-specific > CRC, based on v1 review comment. > * Added full working AVX512/VPCLMULQDQ support for CRC32-Ethernet and > CRC16-CCITT. > > v1: > * Initial version, with incomplete AVX512/VPCLMULQDQ support for > CRC32-Ethernet only. 
> > Mairtin o Loingsigh (2): > net: add run-time architecture specific CRC selection > net: add support for AVX512/VPCLMULQDQ based CRC > > app/test/test_crc.c | 11 +- > config/x86/meson.build | 6 +- > doc/guides/rel_notes/release_20_11.rst | 6 + > lib/librte_net/meson.build | 89 ++++- > lib/librte_net/net_crc.h | 45 +++ > lib/librte_net/net_crc_avx512.c | 424 ++++++++++++++++++++++ > lib/librte_net/{net_crc_neon.h => net_crc_neon.c} | 27 +- > lib/librte_net/{net_crc_sse.h => net_crc_sse.c} | 34 +- > lib/librte_net/rte_net_crc.c | 100 +++-- > lib/librte_net/rte_net_crc.h | 4 +- > 10 files changed, 674 insertions(+), 72 deletions(-) create mode 100644 > lib/librte_net/net_crc.h create mode 100644 lib/librte_net/net_crc_avx512.c > rename lib/librte_net/{net_crc_neon.h => net_crc_neon.c} (95%) rename > lib/librte_net/{net_crc_sse.h => net_crc_sse.c} (94%) > > -- > 2.12.3 I encountered a problem submitting the v2 patch and have submitted a v3 Regards, Mairtin ^ permalink raw reply [flat|nested] 10+ messages in thread
* [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection 2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh ` (2 preceding siblings ...) 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh @ 2020-09-29 15:12 ` Mairtin o Loingsigh 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh 4 siblings, 0 replies; 10+ messages in thread From: Mairtin o Loingsigh @ 2020-09-29 15:12 UTC (permalink / raw) To: jasvinder.singh, bruce.richardson, pablo.de.lara.guarch Cc: dev, brendan.ryan, david.coyle, Mairtin o Loingsigh This patch adds support for run-time selection of the optimal architecture-specific CRC path, based on the supported instruction set(s) of the CPU. The compiler option checks have been moved from the C files to the meson script. The rte_cpu_get_flag_enabled function is called automatically by the library at process initialization time to determine which instructions the CPU supports, with the optimal supported CRC path ultimately selected.
Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> Signed-off-by: David Coyle <david.coyle@intel.com> --- doc/guides/rel_notes/release_20_11.rst | 4 ++ lib/librte_net/meson.build | 34 +++++++++++- lib/librte_net/net_crc.h | 34 ++++++++++++ lib/librte_net/{net_crc_neon.h => net_crc_neon.c} | 27 +++------ lib/librte_net/{net_crc_sse.h => net_crc_sse.c} | 34 ++++-------- lib/librte_net/rte_net_crc.c | 67 ++++++++++++++--------- 6 files changed, 132 insertions(+), 68 deletions(-) create mode 100644 lib/librte_net/net_crc.h rename lib/librte_net/{net_crc_neon.h => net_crc_neon.c} (95%) rename lib/librte_net/{net_crc_sse.h => net_crc_sse.c} (94%) diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst index 4eb3224a7..6bd222dca 100644 --- a/doc/guides/rel_notes/release_20_11.rst +++ b/doc/guides/rel_notes/release_20_11.rst @@ -55,6 +55,10 @@ New Features Also, make sure to start the actual text at the margin. ======================================================= +* **Updated CRC modules of rte_net library.** + + * Added run-time selection of the optimal architecture-specific CRC path. 
+ * **Updated Cisco enic driver.** * Added support for VF representors with single-queue Tx/Rx and flow API diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build index 24ed8253b..b6880bd85 100644 --- a/lib/librte_net/meson.build +++ b/lib/librte_net/meson.build @@ -1,5 +1,5 @@ # SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2017 Intel Corporation +# Copyright(c) 2017-2020 Intel Corporation headers = files('rte_ip.h', 'rte_tcp.h', @@ -20,3 +20,35 @@ headers = files('rte_ip.h', sources = files('rte_arp.c', 'rte_ether.c', 'rte_net.c', 'rte_net_crc.c') deps += ['mbuf'] + +if dpdk_conf.has('RTE_ARCH_X86_64') + net_crc_sse42_cpu_support = \ + cc.get_define('__PCLMUL__', args: machine_args) != '' + net_crc_sse42_cc_support = \ + cc.has_argument('-mpclmul') and cc.has_argument('-maes') + + build_static_net_crc_sse42_lib = 0 + + if net_crc_sse42_cpu_support == true + sources += files('net_crc_sse.c') + cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT'] + elif net_crc_sse42_cc_support == true + build_static_net_crc_sse42_lib = 1 + net_crc_sse42_lib_cflags = ['-mpclmul', '-maes'] + cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT'] + endif + + if build_static_net_crc_sse42_lib == 1 + net_crc_sse42_lib = static_library( + 'net_crc_sse42_lib', + 'net_crc_sse.c', + dependencies: static_rte_eal, + c_args: [cflags, + net_crc_sse42_lib_cflags]) + objs += net_crc_sse42_lib.extract_objects('net_crc_sse.c') + endif +elif dpdk_conf.has('RTE_ARCH_ARM64') and \ + cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != '' + sources += files('net_crc_neon.c') + cflags += ['-DCC_ARM64_NEON_PMULL_SUPPORT'] +endif diff --git a/lib/librte_net/net_crc.h b/lib/librte_net/net_crc.h new file mode 100644 index 000000000..a1578a56c --- /dev/null +++ b/lib/librte_net/net_crc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#ifndef _NET_CRC_H_ +#define _NET_CRC_H_ + +/* + * Different implementations of CRC + */ + +/* 
SSE4.2 */ + +void +rte_net_crc_sse42_init(void); + +uint32_t +rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len); + +uint32_t +rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len); + +/* NEON */ + +void +rte_net_crc_neon_init(void); + +uint32_t +rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len); + +uint32_t +rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len); + +#endif /* _NET_CRC_H_ */ diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.c similarity index 95% rename from lib/librte_net/net_crc_neon.h rename to lib/librte_net/net_crc_neon.c index 63fa1d4a1..b79684ec2 100644 --- a/lib/librte_net/net_crc_neon.h +++ b/lib/librte_net/net_crc_neon.c @@ -1,18 +1,17 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2017 Cavium, Inc + * Copyright(c) 2020 Intel Corporation */ -#ifndef _NET_CRC_NEON_H_ -#define _NET_CRC_NEON_H_ +#include <string.h> +#include <rte_common.h> #include <rte_branch_prediction.h> #include <rte_net_crc.h> #include <rte_vect.h> #include <rte_cpuflags.h> -#ifdef __cplusplus -extern "C" { -#endif +#include "net_crc.h" /** PMULL CRC computation context structure */ struct crc_pmull_ctx { @@ -218,7 +217,7 @@ crc32_eth_calc_pmull( return n; } -static inline void +void rte_net_crc_neon_init(void) { /* Initialize CRC16 data */ @@ -242,9 +241,8 @@ rte_net_crc_neon_init(void) crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8); } -static inline uint32_t -rte_crc16_ccitt_neon_handler(const uint8_t *data, - uint32_t data_len) +uint32_t +rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len) { return (uint16_t)~crc32_eth_calc_pmull(data, data_len, @@ -252,18 +250,11 @@ rte_crc16_ccitt_neon_handler(const uint8_t *data, &crc16_ccitt_pmull); } -static inline uint32_t -rte_crc32_eth_neon_handler(const uint8_t *data, - uint32_t data_len) +uint32_t +rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len) { return ~crc32_eth_calc_pmull(data, data_len, 
0xffffffffUL, &crc32_eth_pmull); } - -#ifdef __cplusplus -} -#endif - -#endif /* _NET_CRC_NEON_H_ */ diff --git a/lib/librte_net/net_crc_sse.h b/lib/librte_net/net_crc_sse.c similarity index 94% rename from lib/librte_net/net_crc_sse.h rename to lib/librte_net/net_crc_sse.c index 1c7b7a548..053b54b39 100644 --- a/lib/librte_net/net_crc_sse.h +++ b/lib/librte_net/net_crc_sse.c @@ -1,18 +1,16 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017 Intel Corporation + * Copyright(c) 2017-2020 Intel Corporation */ -#ifndef _RTE_NET_CRC_SSE_H_ -#define _RTE_NET_CRC_SSE_H_ +#include <string.h> +#include <rte_common.h> #include <rte_branch_prediction.h> +#include <rte_cpuflags.h> -#include <x86intrin.h> -#include <cpuid.h> +#include "net_crc.h" -#ifdef __cplusplus -extern "C" { -#endif +#include <x86intrin.h> /** PCLMULQDQ CRC computation context structure */ struct crc_pclmulqdq_ctx { @@ -259,8 +257,7 @@ crc32_eth_calc_pclmulqdq( return n; } - -static inline void +void rte_net_crc_sse42_init(void) { uint64_t k1, k2, k5, k6; @@ -303,12 +300,10 @@ rte_net_crc_sse42_init(void) * use other data types such as float, double, etc. 
*/ _mm_empty(); - } -static inline uint32_t -rte_crc16_ccitt_sse42_handler(const uint8_t *data, - uint32_t data_len) +uint32_t +rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len) { /** return 16-bit CRC value */ return (uint16_t)~crc32_eth_calc_pclmulqdq(data, @@ -317,18 +312,11 @@ rte_crc16_ccitt_sse42_handler(const uint8_t *data, &crc16_ccitt_pclmulqdq); } -static inline uint32_t -rte_crc32_eth_sse42_handler(const uint8_t *data, - uint32_t data_len) +uint32_t +rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len) { return ~crc32_eth_calc_pclmulqdq(data, data_len, 0xffffffffUL, &crc32_eth_pclmulqdq); } - -#ifdef __cplusplus -} -#endif - -#endif /* _RTE_NET_CRC_SSE_H_ */ diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c index 4f5b9e828..83dccbfba 100644 --- a/lib/librte_net/rte_net_crc.c +++ b/lib/librte_net/rte_net_crc.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017 Intel Corporation + * Copyright(c) 2017-2020 Intel Corporation */ #include <stddef.h> @@ -10,17 +10,7 @@ #include <rte_common.h> #include <rte_net_crc.h> -#if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__) -#define X86_64_SSE42_PCLMULQDQ 1 -#elif defined(RTE_ARCH_ARM64) && defined(__ARM_FEATURE_CRYPTO) -#define ARM64_NEON_PMULL 1 -#endif - -#ifdef X86_64_SSE42_PCLMULQDQ -#include <net_crc_sse.h> -#elif defined ARM64_NEON_PMULL -#include <net_crc_neon.h> -#endif +#include "net_crc.h" /** CRC polynomials */ #define CRC32_ETH_POLYNOMIAL 0x04c11db7UL @@ -47,13 +37,13 @@ static rte_net_crc_handler handlers_scalar[] = { [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler, [RTE_NET_CRC32_ETH] = rte_crc32_eth_handler, }; - -#ifdef X86_64_SSE42_PCLMULQDQ +#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT static rte_net_crc_handler handlers_sse42[] = { [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler, [RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler, }; -#elif defined ARM64_NEON_PMULL +#endif +#ifdef CC_ARM64_NEON_PMULL_SUPPORT 
static rte_net_crc_handler handlers_neon[] = { [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler, [RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler, @@ -142,22 +132,44 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len) crc32_eth_lut); } +#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT +static uint8_t +sse42_pclmulqdq_cpu_supported(void) +{ + return rte_cpu_get_flag_enabled(RTE_CPUFLAG_PCLMULQDQ); +} +#endif + +#ifdef CC_ARM64_NEON_PMULL_SUPPORT +static uint8_t +neon_pmull_cpu_supported(void) +{ + return rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL); +} +#endif + void rte_net_crc_set_alg(enum rte_net_crc_alg alg) { switch (alg) { -#ifdef X86_64_SSE42_PCLMULQDQ +#ifdef RTE_ARCH_X86_64 case RTE_NET_CRC_SSE42: - handlers = handlers_sse42; - break; -#elif defined ARM64_NEON_PMULL - /* fall-through */ +#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT + if (sse42_pclmulqdq_cpu_supported()) { + handlers = handlers_sse42; + break; + } +#endif +#endif /* RTE_ARCH_X86_64 */ +#ifdef RTE_ARCH_ARM64 case RTE_NET_CRC_NEON: - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) { +#ifdef CC_ARM64_NEON_PMULL_SUPPORT + if (neon_pmull_cpu_supported()) { handlers = handlers_neon; break; } #endif +#endif /* RTE_ARCH_ARM64 */ /* fall-through */ case RTE_NET_CRC_SCALAR: /* fall-through */ @@ -188,11 +200,14 @@ RTE_INIT(rte_net_crc_init) rte_net_crc_scalar_init(); -#ifdef X86_64_SSE42_PCLMULQDQ - alg = RTE_NET_CRC_SSE42; - rte_net_crc_sse42_init(); -#elif defined ARM64_NEON_PMULL - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) { +#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT + if (sse42_pclmulqdq_cpu_supported()) { + alg = RTE_NET_CRC_SSE42; + rte_net_crc_sse42_init(); + } +#endif +#ifdef CC_ARM64_NEON_PMULL_SUPPORT + if (neon_pmull_cpu_supported()) { alg = RTE_NET_CRC_NEON; rte_net_crc_neon_init(); } -- 2.12.3 ^ permalink raw reply [flat|nested] 10+ messages in thread
* [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC 2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh ` (3 preceding siblings ...) 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection Mairtin o Loingsigh @ 2020-09-29 15:12 ` Mairtin o Loingsigh 4 siblings, 0 replies; 10+ messages in thread From: Mairtin o Loingsigh @ 2020-09-29 15:12 UTC (permalink / raw) To: jasvinder.singh, bruce.richardson, pablo.de.lara.guarch Cc: dev, brendan.ryan, david.coyle, Mairtin o Loingsigh This patch enables the optimized calculation of CRC32-Ethernet and CRC16-CCITT using the AVX512 and VPCLMULQDQ instruction sets. This CRC implementation is built if the compiler supports the required instruction sets. It is selected at run-time if the host CPU, again, supports the required instruction sets. Signed-off-by: Mairtin o Loingsigh <mairtin.oloingsigh@intel.com> Signed-off-by: David Coyle <david.coyle@intel.com> --- app/test/test_crc.c | 11 +- config/x86/meson.build | 6 +- doc/guides/rel_notes/release_20_11.rst | 2 + lib/librte_net/meson.build | 55 +++++ lib/librte_net/net_crc.h | 11 + lib/librte_net/net_crc_avx512.c | 424 +++++++++++++++++++++++++++++++++ lib/librte_net/rte_net_crc.c | 33 +++ lib/librte_net/rte_net_crc.h | 4 +- 8 files changed, 542 insertions(+), 4 deletions(-) create mode 100644 lib/librte_net/net_crc_avx512.c diff --git a/app/test/test_crc.c b/app/test/test_crc.c index f8a74e04e..bf1d34435 100644 --- a/app/test/test_crc.c +++ b/app/test/test_crc.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017 Intel Corporation + * Copyright(c) 2017-2020 Intel Corporation */ #include "test.h" @@ -149,6 +149,15 @@ test_crc(void) return ret; } + /* set CRC avx512 mode */ + rte_net_crc_set_alg(RTE_NET_CRC_AVX512); + + ret = test_crc_calc(); + if (ret < 0) { + printf("test crc (x86_64 AVX512): failed (%d)\n", ret); + return ret; + } + /* 
set CRC neon mode */ rte_net_crc_set_alg(RTE_NET_CRC_NEON); diff --git a/config/x86/meson.build b/config/x86/meson.build index fea4d5403..172b72b72 100644 --- a/config/x86/meson.build +++ b/config/x86/meson.build @@ -1,5 +1,5 @@ # SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2017-2019 Intel Corporation +# Copyright(c) 2017-2020 Intel Corporation # get binutils version for the workaround of Bug 97 if not is_windows @@ -23,7 +23,9 @@ endforeach optional_flags = ['AES', 'PCLMUL', 'AVX', 'AVX2', 'AVX512F', - 'RDRND', 'RDSEED'] + 'RDRND', 'RDSEED', + 'AVX512BW', 'AVX512DQ', + 'AVX512VL', 'VPCLMULQDQ'] foreach f:optional_flags if cc.get_define('__@0@__'.format(f), args: machine_args) == '1' if f == 'PCLMUL' # special case flags with different defines diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst index 6bd222dca..509749ebd 100644 --- a/doc/guides/rel_notes/release_20_11.rst +++ b/doc/guides/rel_notes/release_20_11.rst @@ -58,6 +58,8 @@ New Features * **Updated CRC modules of rte_net library.** * Added run-time selection of the optimal architecture-specific CRC path. + * Added optimized implementations of CRC32-Ethernet and CRC16-CCITT + using the AVX512 and VPCLMULQDQ instruction sets. 
* **Updated Cisco enic driver.** diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build index b6880bd85..eeae25bc1 100644 --- a/lib/librte_net/meson.build +++ b/lib/librte_net/meson.build @@ -24,18 +24,62 @@ deps += ['mbuf'] if dpdk_conf.has('RTE_ARCH_X86_64') net_crc_sse42_cpu_support = \ cc.get_define('__PCLMUL__', args: machine_args) != '' + net_crc_avx512_cpu_support = \ + cc.get_define('__AVX512F__', args: machine_args) != '' and \ + cc.get_define('__AVX512BW__', args: machine_args) != '' and \ + cc.get_define('__AVX512DQ__', args: machine_args) != '' and \ + cc.get_define('__AVX512VL__', args: machine_args) != '' and \ + cc.get_define('__VPCLMULQDQ__', args: machine_args) != '' + net_crc_sse42_cc_support = \ cc.has_argument('-mpclmul') and cc.has_argument('-maes') + net_crc_avx512_cc_support = \ + not machine_args.contains('-mno-avx512f') and \ + cc.has_argument('-mavx512f') and \ + cc.has_argument('-mavx512bw') and \ + cc.has_argument('-mavx512dq') and \ + cc.has_argument('-mavx512vl') and \ + cc.has_argument('-mvpclmulqdq') and \ + cc.has_argument('-mavx2') and \ + cc.has_argument('-mavx') build_static_net_crc_sse42_lib = 0 + build_static_net_crc_avx512_lib = 0 if net_crc_sse42_cpu_support == true sources += files('net_crc_sse.c') cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT'] + if net_crc_avx512_cpu_support == true + sources += files('net_crc_avx512.c') + cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT'] + elif net_crc_avx512_cc_support == true + build_static_net_crc_avx512_lib = 1 + net_crc_avx512_lib_cflags = ['-mavx512f', + '-mavx512bw', + '-mavx512dq', + '-mavx512vl', + '-mvpclmulqdq', + '-mavx2', + '-mavx'] + cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT'] + endif elif net_crc_sse42_cc_support == true build_static_net_crc_sse42_lib = 1 net_crc_sse42_lib_cflags = ['-mpclmul', '-maes'] cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT'] + if net_crc_avx512_cc_support == true + build_static_net_crc_avx512_lib = 1 + 
net_crc_avx512_lib_cflags = ['-mpclmul', + '-maes', + '-mavx512f', + '-mavx512bw', + '-mavx512dq', + '-mavx512vl', + '-mvpclmulqdq', + '-mavx2', + '-mavx'] + cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT'] + endif endif if build_static_net_crc_sse42_lib == 1 @@ -47,6 +91,17 @@ if dpdk_conf.has('RTE_ARCH_X86_64') net_crc_sse42_lib_cflags]) objs += net_crc_sse42_lib.extract_objects('net_crc_sse.c') endif + + if build_static_net_crc_avx512_lib == 1 + net_crc_avx512_lib = static_library( + 'net_crc_avx512_lib', + 'net_crc_avx512.c', + dependencies: static_rte_eal, + c_args: [cflags, + net_crc_avx512_lib_cflags]) + objs += net_crc_avx512_lib.extract_objects('net_crc_avx512.c') + endif + elif dpdk_conf.has('RTE_ARCH_ARM64') and \ cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != '' sources += files('net_crc_neon.c') diff --git a/lib/librte_net/net_crc.h b/lib/librte_net/net_crc.h index a1578a56c..7a74d5406 100644 --- a/lib/librte_net/net_crc.h +++ b/lib/librte_net/net_crc.h @@ -20,6 +20,17 @@ rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len); uint32_t rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len); +/* AVX512 */ + +void +rte_net_crc_avx512_init(void); + +uint32_t +rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len); + +uint32_t +rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len); + /* NEON */ void diff --git a/lib/librte_net/net_crc_avx512.c b/lib/librte_net/net_crc_avx512.c new file mode 100644 index 000000000..81aac6349 --- /dev/null +++ b/lib/librte_net/net_crc_avx512.c @@ -0,0 +1,424 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#include <string.h> + +#include <rte_common.h> +#include <rte_branch_prediction.h> +#include <rte_cpuflags.h> + +#include "net_crc.h" + +#include <x86intrin.h> + +/* VPCLMULQDQ CRC computation context structure */ +struct crc_vpclmulqdq_ctx { + __m512i rk1_rk2; + __m512i rk3_rk4; + __m512i fold_7x128b; + 
__m512i fold_3x128b; + __m128i rk5_rk6; + __m128i rk7_rk8; + __m128i fold_1x128b; +}; + +static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64); +static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64); + +static uint16_t byte_len_to_mask_table[] = { + 0x0000, 0x0001, 0x0003, 0x0007, + 0x000f, 0x001f, 0x003f, 0x007f, + 0x00ff, 0x01ff, 0x03ff, 0x07ff, + 0x0fff, 0x1fff, 0x3fff, 0x7fff, + 0xffff}; + +static const uint8_t shf_table[32] __rte_aligned(16) = { + 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +}; + +static const uint32_t mask[4] __rte_aligned(16) = { + 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 +}; + +static const uint32_t mask2[4] __rte_aligned(16) = { + 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff +}; + +static __rte_always_inline __m512i +crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold) +{ + __m512i tmp0, tmp1; + + tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01); + tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10); + + return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96); +} + +static __rte_always_inline __m128i +crc32_fold_128(__m512i fold0, __m512i fold1, + const struct crc_vpclmulqdq_ctx *params) +{ + __m128i res, res2; + __m256i a; + __m512i tmp0, tmp1, tmp2, tmp3; + __m512i tmp4; + + tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01); + tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10); + + res = _mm512_extracti64x2_epi64(fold1, 3); + tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res); + + tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01); + tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10); + + tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96); + tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96); + + tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e); + + a = 
_mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0); + res = _mm256_extracti64x2_epi64(a, 1); + res2 = _mm_xor_si128(res, *(__m128i *)&a); + + return res2; +} + +static __rte_always_inline __m128i +last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res, + const struct crc_vpclmulqdq_ctx *params) +{ + uint32_t offset; + __m128i res2, res3, res4, pshufb_shf; + + const uint32_t mask3[4] __rte_aligned(16) = { + 0x80808080, 0x80808080, 0x80808080, 0x80808080 + }; + + res2 = res; + offset = data_len - n; + res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]); + + pshufb_shf = _mm_loadu_si128((const __m128i *) + (shf_table + (data_len-n))); + + res = _mm_shuffle_epi8(res, pshufb_shf); + pshufb_shf = _mm_xor_si128(pshufb_shf, + _mm_load_si128((const __m128i *) mask3)); + res2 = _mm_shuffle_epi8(res2, pshufb_shf); + + res2 = _mm_blendv_epi8(res2, res3, pshufb_shf); + + res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01); + res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10); + res = _mm_ternarylogic_epi64(res, res2, res4, 0x96); + + return res; +} + +static __rte_always_inline __m128i +done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params) +{ + __m128i res1; + + res1 = res; + + res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0); + res1 = _mm_srli_si128(res1, 8); + res = _mm_xor_si128(res, res1); + + res1 = res; + res = _mm_slli_si128(res, 4); + res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10); + res = _mm_xor_si128(res, res1); + + return res; +} + +static __rte_always_inline uint32_t +barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params) +{ + __m128i tmp0, tmp1; + + data64 = _mm_and_si128(data64, *(const __m128i *)mask2); + tmp0 = data64; + tmp1 = data64; + + data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0); + data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask, + 0x28); + + tmp1 = data64; + data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10); + data64 = 
_mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96); + + return _mm_extract_epi32(data64, 2); +} + +static __rte_always_inline void +reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n, + const struct crc_vpclmulqdq_ctx *params) +{ + __m128i tmp, tmp1; + + tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1); + *fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10); + *fold = _mm_xor_si128(*fold, tmp); + tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]); + *fold = _mm_xor_si128(*fold, tmp1); + *n += 16; + *len -= 16; +} + +static __rte_always_inline uint32_t +crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc, + const struct crc_vpclmulqdq_ctx *params) +{ + __m128i res, d; + __m256i b; + __m512i temp, k; + __m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3; + __m512i fold0, fold1, fold2, fold3; + __mmask16 mask; + uint32_t n = 0; + int reduction = 0; + + /* Get CRC init value */ + b = _mm256_insert_epi32(_mm256_setzero_si256(), crc, 0); + temp = _mm512_inserti32x8(_mm512_setzero_si512(), b, 0); + + if (data_len > 255) { + fold0 = _mm512_loadu_si512((const __m512i *)data); + fold1 = _mm512_loadu_si512((const __m512i *)(data+64)); + fold2 = _mm512_loadu_si512((const __m512i *)(data+128)); + fold3 = _mm512_loadu_si512((const __m512i *)(data+192)); + fold0 = _mm512_xor_si512(fold0, temp); + + /* Main folding loop */ + k = params->rk1_rk2; + for (n = 256; (n + 256) <= data_len; n += 256) { + qw0 = _mm512_loadu_si512((const __m512i *)&data[n]); + qw1 = _mm512_loadu_si512((const __m512i *) + &(data[n+64])); + qw2 = _mm512_loadu_si512((const __m512i *) + &(data[n+128])); + qw3 = _mm512_loadu_si512((const __m512i *) + &(data[n+192])); + fold0 = crcr32_folding_round(qw0, k, fold0); + fold1 = crcr32_folding_round(qw1, k, fold1); + fold2 = crcr32_folding_round(qw2, k, fold2); + fold3 = crcr32_folding_round(qw3, k, fold3); + } + + /* 256 to 128 fold */ + k = params->rk3_rk4; + fold0 = crcr32_folding_round(fold2, k, 
fold0); + fold1 = crcr32_folding_round(fold3, k, fold1); + + res = crc32_fold_128(fold0, fold1, params); + + reduction = 240 - ((n+256)-data_len); + + while (reduction > 0) + reduction_loop(&res, &reduction, data, &n, + params); + + reduction += 16; + + if (n != data_len) + res = last_two_xmm(data, data_len, n, res, + params); + } else { + if (data_len > 31) { + res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0); + d = _mm_loadu_si128((const __m128i *)data); + res = _mm_xor_si128(res, d); + n += 16; + + reduction = 240 - ((n+256)-data_len); + + while (reduction > 0) + reduction_loop(&res, &reduction, data, &n, + params); + + if (n != data_len) + res = last_two_xmm(data, data_len, n, res, + params); + } else if (data_len > 16) { + res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0); + d = _mm_loadu_si128((const __m128i *)data); + res = _mm_xor_si128(res, d); + n += 16; + + if (n != data_len) + res = last_two_xmm(data, data_len, n, res, + params); + } else if (data_len == 16) { + res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0); + d = _mm_loadu_si128((const __m128i *)data); + res = _mm_xor_si128(res, d); + } else { + res = _mm_insert_epi32(_mm_setzero_si128(), crc, 0); + mask = byte_len_to_mask_table[data_len]; + d = _mm_maskz_loadu_epi8(mask, data); + res = _mm_xor_si128(res, d); + + if (data_len > 3) { + d = _mm_loadu_si128((const __m128i *) + &shf_table[data_len]); + res = _mm_shuffle_epi8(res, d); + } else if (data_len > 2) { + res = _mm_slli_si128(res, 5); + goto do_barrett_reduction; + } else if (data_len > 1) { + res = _mm_slli_si128(res, 6); + goto do_barrett_reduction; + } else if (data_len > 0) { + res = _mm_slli_si128(res, 7); + goto do_barrett_reduction; + } else { + /* zero length case */ + return crc; + } + } + } + + res = done_128(res, params); + +do_barrett_reduction: + n = barrett_reduction(res, params); + + return n; +} + +static void +crc32_load_init_constants(void) +{ + __m128i a; + /* fold constants */ + uint64_t c0 = 0x00000000e95c1271; + 
uint64_t c1 = 0x00000000ce3371cb; + uint64_t c2 = 0x00000000910eeec1; + uint64_t c3 = 0x0000000033fff533; + uint64_t c4 = 0x000000000cbec0ed; + uint64_t c5 = 0x0000000031f8303f; + uint64_t c6 = 0x0000000057c54819; + uint64_t c7 = 0x00000000df068dc2; + uint64_t c8 = 0x00000000ae0b5394; + uint64_t c9 = 0x000000001c279815; + uint64_t c10 = 0x000000001d9513d7; + uint64_t c11 = 0x000000008f352d95; + uint64_t c12 = 0x00000000af449247; + uint64_t c13 = 0x000000003db1ecdc; + uint64_t c14 = 0x0000000081256527; + uint64_t c15 = 0x00000000f1da05aa; + uint64_t c16 = 0x00000000ccaa009e; + uint64_t c17 = 0x00000000ae689191; + uint64_t c18 = 0x00000000ccaa009e; + uint64_t c19 = 0x00000000b8bc6765; + uint64_t c20 = 0x00000001f7011640; + uint64_t c21 = 0x00000001db710640; + + a = _mm_set_epi64x(c1, c0); + crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a); + + a = _mm_set_epi64x(c3, c2); + crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a); + + crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8, + c9, c10, c11); + crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15, + c16, c17, 0, 0); + crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16), + _mm_cvtsi64_m64(c17)); + + crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18), + _mm_cvtsi64_m64(c19)); + crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20), + _mm_cvtsi64_m64(c21)); +} + +static void +crc16_load_init_constants(void) +{ + __m128i a; + /* fold constants */ + uint64_t c0 = 0x0000000000009a19; + uint64_t c1 = 0x0000000000002df8; + uint64_t c2 = 0x00000000000068af; + uint64_t c3 = 0x000000000000b6c9; + uint64_t c4 = 0x000000000000c64f; + uint64_t c5 = 0x000000000000cd95; + uint64_t c6 = 0x000000000000d341; + uint64_t c7 = 0x000000000000b8f2; + uint64_t c8 = 0x0000000000000842; + uint64_t c9 = 0x000000000000b072; + uint64_t c10 = 0x00000000000047e3; + uint64_t c11 = 0x000000000000922d; + uint64_t c12 = 0x0000000000000e3a; + uint64_t c13 = 0x0000000000004d7a; + uint64_t c14 = 0x0000000000005b44; + uint64_t 
c15 = 0x0000000000007762; + uint64_t c16 = 0x00000000000081bf; + uint64_t c17 = 0x0000000000008e10; + uint64_t c18 = 0x00000000000081bf; + uint64_t c19 = 0x0000000000001cbb; + uint64_t c20 = 0x000000011c581910; + uint64_t c21 = 0x0000000000010810; + + a = _mm_set_epi64x(c1, c0); + crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a); + + a = _mm_set_epi64x(c3, c2); + crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a); + + crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8, + c9, c10, c11); + crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15, + c16, c17, 0, 0); + crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16), + _mm_cvtsi64_m64(c17)); + + crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18), + _mm_cvtsi64_m64(c19)); + crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20), + _mm_cvtsi64_m64(c21)); +} + +void +rte_net_crc_avx512_init(void) +{ + crc32_load_init_constants(); + crc16_load_init_constants(); + + /* + * Reset the register as following calculation may + * use other data types such as float, double, etc. 
+ */ + _mm_empty(); +} + +uint32_t +rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len) +{ + /* return 16-bit CRC value */ + return (uint16_t)~crc32_eth_calc_vpclmulqdq(data, + data_len, + 0xffff, + &crc16_ccitt); +} + +uint32_t +rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len) +{ + /* return 32-bit CRC value */ + return ~crc32_eth_calc_vpclmulqdq(data, + data_len, + 0xffffffffUL, + &crc32_eth); +} diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c index 83dccbfba..fcf9cc0ef 100644 --- a/lib/librte_net/rte_net_crc.c +++ b/lib/librte_net/rte_net_crc.c @@ -37,6 +37,12 @@ static rte_net_crc_handler handlers_scalar[] = { [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler, [RTE_NET_CRC32_ETH] = rte_crc32_eth_handler, }; +#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT +static rte_net_crc_handler handlers_avx512[] = { + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_avx512_handler, + [RTE_NET_CRC32_ETH] = rte_crc32_eth_avx512_handler, +}; +#endif #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT static rte_net_crc_handler handlers_sse42[] = { [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler, @@ -132,6 +138,19 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len) crc32_eth_lut); } +#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT +static uint8_t +avx512_vpclmulqdq_cpu_supported(void) +{ + return rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512DQ) && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_PCLMULQDQ) && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_VPCLMULQDQ); +} +#endif + #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT static uint8_t sse42_pclmulqdq_cpu_supported(void) @@ -153,6 +172,14 @@ rte_net_crc_set_alg(enum rte_net_crc_alg alg) { switch (alg) { #ifdef RTE_ARCH_X86_64 + case RTE_NET_CRC_AVX512: +#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT + if (avx512_vpclmulqdq_cpu_supported()) { + 
handlers = handlers_avx512; + break; + } +#endif + /* fall-through */ case RTE_NET_CRC_SSE42: #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT if (sse42_pclmulqdq_cpu_supported()) { @@ -206,6 +233,12 @@ RTE_INIT(rte_net_crc_init) rte_net_crc_sse42_init(); } #endif +#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT + if (avx512_vpclmulqdq_cpu_supported()) { + alg = RTE_NET_CRC_AVX512; + rte_net_crc_avx512_init(); + } +#endif #ifdef CC_ARM64_NEON_PMULL_SUPPORT if (neon_pmull_cpu_supported()) { alg = RTE_NET_CRC_NEON; diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h index 16e85ca97..72d3e10ff 100644 --- a/lib/librte_net/rte_net_crc.h +++ b/lib/librte_net/rte_net_crc.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017 Intel Corporation + * Copyright(c) 2017-2020 Intel Corporation */ #ifndef _RTE_NET_CRC_H_ @@ -23,6 +23,7 @@ enum rte_net_crc_alg { RTE_NET_CRC_SCALAR = 0, RTE_NET_CRC_SSE42, RTE_NET_CRC_NEON, + RTE_NET_CRC_AVX512, }; /** @@ -35,6 +36,7 @@ enum rte_net_crc_alg { * - RTE_NET_CRC_SCALAR * - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic) * - RTE_NET_CRC_NEON (Use ARM Neon intrinsic) + * - RTE_NET_CRC_AVX512 (Use 512-bit AVX intrinsic) */ void rte_net_crc_set_alg(enum rte_net_crc_alg alg); -- 2.12.3 ^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2020-09-29 15:47 UTC | newest] Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2020-09-10 12:01 [dpdk-dev] [PATCH] net: add support for AVX512 when generating CRC Mairtin o Loingsigh 2020-09-10 12:27 ` Bruce Richardson 2020-09-10 12:52 ` O'loingsigh, Mairtin 2020-09-29 15:45 ` O'loingsigh, Mairtin 2020-09-11 9:57 ` De Lara Guarch, Pablo 2020-09-29 15:47 ` O'loingsigh, Mairtin 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 0/2] net: add CRC run-time checks and AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh 2020-09-29 15:41 ` O'loingsigh, Mairtin 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 1/2] net: add run-time architecture specific CRC selection Mairtin o Loingsigh 2020-09-29 15:12 ` [dpdk-dev] [PATCH v2 2/2] net: add support for AVX512/VPCLMULQDQ based CRC Mairtin o Loingsigh
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as the URLs for its NNTP newsgroup(s).