From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id 91716A0526; Wed, 8 Jul 2020 21:57:44 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 72B4E1DBC2; Wed, 8 Jul 2020 21:57:44 +0200 (CEST) Received: from mga07.intel.com (mga07.intel.com [134.134.136.100]) by dpdk.org (Postfix) with ESMTP id C8E6A1D942 for ; Wed, 8 Jul 2020 21:57:42 +0200 (CEST) IronPort-SDR: Lm/U52C5rJIzHh2mi2/ZNyUVqQoO+c/2q2cb0vLNiUWREQAYlefA025/X3V4bdgMfw+eFXzkHF DwwcbEXvL8uA== X-IronPort-AV: E=McAfee;i="6000,8403,9676"; a="212814133" X-IronPort-AV: E=Sophos;i="5.75,329,1589266800"; d="scan'208";a="212814133" X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga004.fm.intel.com ([10.253.24.48]) by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Jul 2020 12:57:41 -0700 IronPort-SDR: uTnPv20ezDcPZ1kRrfqedIJFMR8ELmMJLLQ+xLj1yZalcicWO9cdK+wVoVTnI/JjEBsJ+KTcX1 hs31kBphM7fw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.75,329,1589266800"; d="scan'208";a="306152292" Received: from vmedvedk-mobl.ger.corp.intel.com (HELO [10.213.247.70]) ([10.213.247.70]) by fmsmga004.fm.intel.com with ESMTP; 08 Jul 2020 12:57:38 -0700 To: "Ananyev, Konstantin" , "dev@dpdk.org" Cc: "Richardson, Bruce" References: From: "Medvedkin, Vladimir" Message-ID: <5a79385e-6156-40bd-3ac1-dfeb1f274575@intel.com> Date: Wed, 8 Jul 2020 20:57:37 +0100 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Thunderbird/68.10.0 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit Subject: Re: [dpdk-dev] [PATCH v3 4/8] fib: introduce AVX512 lookup X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , 
Errors-To: dev-bounces@dpdk.org Sender: "dev" On 24/06/2020 14:18, Ananyev, Konstantin wrote: > >> Add new lookup implementation for DIR24_8 algorithm using >> AVX512 instruction set >> >> Signed-off-by: Vladimir Medvedkin >> --- >> lib/librte_fib/Makefile | 14 ++++ >> lib/librte_fib/dir24_8.c | 24 ++++++ >> lib/librte_fib/dir24_8_avx512.c | 165 ++++++++++++++++++++++++++++++++++++++++ >> lib/librte_fib/dir24_8_avx512.h | 24 ++++++ >> lib/librte_fib/meson.build | 11 +++ >> lib/librte_fib/rte_fib.h | 3 +- >> 6 files changed, 240 insertions(+), 1 deletion(-) >> create mode 100644 lib/librte_fib/dir24_8_avx512.c >> create mode 100644 lib/librte_fib/dir24_8_avx512.h >> >> diff --git a/lib/librte_fib/Makefile b/lib/librte_fib/Makefile >> index 1dd2a49..3958da1 100644 >> --- a/lib/librte_fib/Makefile >> +++ b/lib/librte_fib/Makefile >> @@ -19,4 +19,18 @@ SRCS-$(CONFIG_RTE_LIBRTE_FIB) := rte_fib.c rte_fib6.c dir24_8.c trie.c >> # install this header file >> SYMLINK-$(CONFIG_RTE_LIBRTE_FIB)-include := rte_fib.h rte_fib6.h >> >> +CC_AVX512F_SUPPORT=$(shell $(CC) -mavx512f -dM -E - &1 | \ >> +grep -q __AVX512F__ && echo 1) >> + >> +CC_AVX512DQ_SUPPORT=$(shell $(CC) -mavx512dq -dM -E - &1 | \ >> +grep -q __AVX512DQ__ && echo 1) >> + >> +ifeq ($(CC_AVX512F_SUPPORT), 1) >> +ifeq ($(CC_AVX512DQ_SUPPORT), 1) >> +SRCS-$(CONFIG_RTE_LIBRTE_FIB) += dir24_8_avx512.c >> +CFLAGS_dir24_8_avx512.o += -mavx512f >> +CFLAGS_dir24_8_avx512.o += -mavx512dq >> +CFLAGS_dir24_8.o += -DCC_DIR24_8_AVX512_SUPPORT >> +endif >> +endif >> include $(RTE_SDK)/mk/rte.lib.mk >> diff --git a/lib/librte_fib/dir24_8.c b/lib/librte_fib/dir24_8.c >> index 9d74653..0a1c53f 100644 >> --- a/lib/librte_fib/dir24_8.c >> +++ b/lib/librte_fib/dir24_8.c >> @@ -18,6 +18,12 @@ >> #include >> #include "dir24_8.h" >> >> +#ifdef CC_DIR24_8_AVX512_SUPPORT >> + >> +#include "dir24_8_avx512.h" >> + >> +#endif /* CC_DIR24_8_AVX512_SUPPORT */ >> + >> #define DIR24_8_NAMESIZE64 >> >> #define ROUNDUP(x, y) RTE_ALIGN_CEIL(x, (1 << 
(32 - y))) >> @@ -62,6 +68,24 @@ dir24_8_get_lookup_fn(void *p, enum rte_fib_dir24_8_lookup_type type) >> } >> case RTE_FIB_DIR24_8_SCALAR_UNI: >> return dir24_8_lookup_bulk_uni; >> +#ifdef CC_DIR24_8_AVX512_SUPPORT >> +case RTE_FIB_DIR24_8_VECTOR: >> +if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) <= 0) >> +return NULL; >> + >> +switch (nh_sz) { >> +case RTE_FIB_DIR24_8_1B: >> +return rte_dir24_8_vec_lookup_bulk_1b; >> +case RTE_FIB_DIR24_8_2B: >> +return rte_dir24_8_vec_lookup_bulk_2b; >> +case RTE_FIB_DIR24_8_4B: >> +return rte_dir24_8_vec_lookup_bulk_4b; >> +case RTE_FIB_DIR24_8_8B: >> +return rte_dir24_8_vec_lookup_bulk_8b; >> +default: >> +return NULL; >> +} >> +#endif >> default: >> return NULL; >> } >> diff --git a/lib/librte_fib/dir24_8_avx512.c b/lib/librte_fib/dir24_8_avx512.c >> new file mode 100644 >> index 0000000..43dba28 >> --- /dev/null >> +++ b/lib/librte_fib/dir24_8_avx512.c >> @@ -0,0 +1,165 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(c) 2020 Intel Corporation >> + */ >> + >> +#include >> +#include >> + >> +#include "dir24_8.h" >> +#include "dir24_8_avx512.h" >> + >> +static __rte_always_inline void >> +dir24_8_vec_lookup_x16(void *p, const uint32_t *ips, >> +uint64_t *next_hops, int size) >> +{ >> +struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p; >> +__mmask16 msk_ext; >> +__mmask16 exp_msk = 0x5555; >> +__m512i ip_vec, idxes, res, bytes; >> +const __m512i zero = _mm512_set1_epi32(0); >> +const __m512i lsb = _mm512_set1_epi32(1); >> +const __m512i lsbyte_msk = _mm512_set1_epi32(0xff); >> +__m512i tmp1, tmp2, res_msk; >> +__m256i tmp256; >> +/* used to mask gather values if size is 1/2 (8/16 bit next hops) */ >> +if (size == sizeof(uint8_t)) >> +res_msk = _mm512_set1_epi32(UINT8_MAX); >> +else if (size == sizeof(uint16_t)) >> +res_msk = _mm512_set1_epi32(UINT16_MAX); >> + >> +ip_vec = _mm512_loadu_si512(ips); >> +/* mask 24 most significant bits */ >> +idxes = _mm512_srli_epi32(ip_vec, 8); >> + >> +/** >> + * lookup in 
tbl24 >> + * Put it inside branch to make compiler happy with -O0 >> + */ >> +if (size == sizeof(uint8_t)) { >> +res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1); >> +res = _mm512_and_epi32(res, res_msk); >> +} else if (size == sizeof(uint16_t)) { >> +res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2); >> +res = _mm512_and_epi32(res, res_msk); >> +} else >> +res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4); >> + >> +/* get extended entries indexes */ >> +msk_ext = _mm512_test_epi32_mask(res, lsb); >> + >> +if (msk_ext != 0) { >> +idxes = _mm512_srli_epi32(res, 1); >> +idxes = _mm512_slli_epi32(idxes, 8); >> +bytes = _mm512_and_epi32(ip_vec, lsbyte_msk); >> +idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes); >> +if (size == sizeof(uint8_t)) { >> +idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, >> +idxes, (const int *)dp->tbl8, 1); >> +idxes = _mm512_and_epi32(idxes, res_msk); >> +} else if (size == sizeof(uint16_t)) { >> +idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, >> +idxes, (const int *)dp->tbl8, 2); >> +idxes = _mm512_and_epi32(idxes, res_msk); >> +} else >> +idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, >> +idxes, (const int *)dp->tbl8, 4); >> + >> +res = _mm512_mask_blend_epi32(msk_ext, res, idxes); >> +} >> + >> +res = _mm512_srli_epi32(res, 1); >> +tmp1 = _mm512_maskz_expand_epi32(exp_msk, res); >> +tmp256 = _mm512_extracti32x8_epi32(res, 1); >> +tmp2 = _mm512_maskz_expand_epi32(exp_msk, >> +_mm512_castsi256_si512(tmp256)); >> +_mm512_storeu_si512(next_hops, tmp1); >> +_mm512_storeu_si512(next_hops + 8, tmp2); >> +} >> + >> +static __rte_always_inline void >> +dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips, >> +uint64_t *next_hops) >> +{ >> +struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p; >> +const __m512i zero = _mm512_set1_epi32(0); >> +const __m512i lsbyte_msk = _mm512_set1_epi64(0xff); >> +const __m512i lsb = _mm512_set1_epi64(1); >> +__m512i res, idxes, bytes; >> +__m256i idxes_256, 
ip_vec; >> +__mmask8 msk_ext; >> + >> +ip_vec = _mm256_loadu_si256((const void *)ips); >> +/* mask 24 most significant bits */ >> +idxes_256 = _mm256_srli_epi32(ip_vec, 8); >> + >> +/* lookup in tbl24 */ >> +res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8); >> + >> +/* get extended entries indexes */ >> +msk_ext = _mm512_test_epi64_mask(res, lsb); >> + >> +if (msk_ext != 0) { >> +bytes = _mm512_cvtepi32_epi64(ip_vec); >> +idxes = _mm512_srli_epi64(res, 1); >> +idxes = _mm512_slli_epi64(idxes, 8); >> +bytes = _mm512_and_epi64(bytes, lsbyte_msk); >> +idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes); >> +idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes, >> +(const void *)dp->tbl8, 8); >> + >> +res = _mm512_mask_blend_epi64(msk_ext, res, idxes); >> +} >> + >> +res = _mm512_srli_epi64(res, 1); >> +_mm512_storeu_si512(next_hops, res); >> +} >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n) >> +{ >> +uint32_t i; >> +for (i = 0; i < (n / 16); i++) >> +dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16, >> +sizeof(uint8_t)); >> + > > Just curious: if for remainder, instead of calling scalar lookup, > Introduce a masked version of avx512 lookup - would it be slower? As was discussed offline, I tried, and it is slower than using scalar lookup for remainder. 
> >> +dir24_8_lookup_bulk_1b(p, ips + i * 16, next_hops + i * 16, >> +n - i * 16); >> +} >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n) >> +{ >> +uint32_t i; >> +for (i = 0; i < (n / 16); i++) >> +dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16, >> +sizeof(uint16_t)); >> + >> +dir24_8_lookup_bulk_2b(p, ips + i * 16, next_hops + i * 16, >> +n - i * 16); >> +} >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n) >> +{ >> +uint32_t i; >> +for (i = 0; i < (n / 16); i++) >> +dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16, >> +sizeof(uint32_t)); >> + >> +dir24_8_lookup_bulk_4b(p, ips + i * 16, next_hops + i * 16, >> +n - i * 16); >> +} >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n) >> +{ >> +uint32_t i; >> +for (i = 0; i < (n / 8); i++) >> +dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8); >> + >> +dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8); >> +} >> diff --git a/lib/librte_fib/dir24_8_avx512.h b/lib/librte_fib/dir24_8_avx512.h >> new file mode 100644 >> index 0000000..1d3c2b9 >> --- /dev/null >> +++ b/lib/librte_fib/dir24_8_avx512.h >> @@ -0,0 +1,24 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(c) 2020 Intel Corporation >> + */ >> + >> +#ifndef _DIR248_AVX512_H_ >> +#define _DIR248_AVX512_H_ >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n); >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n); >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips, >> +uint64_t *next_hops, const unsigned int n); >> + >> +void >> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips, >> +uint64_t 
*next_hops, const unsigned int n); >> + >> +#endif /* _DIR248_AVX512_H_ */ >> diff --git a/lib/librte_fib/meson.build b/lib/librte_fib/meson.build >> index 771828f..0963f3c 100644 >> --- a/lib/librte_fib/meson.build >> +++ b/lib/librte_fib/meson.build >> @@ -5,3 +5,14 @@ >> sources = files('rte_fib.c', 'rte_fib6.c', 'dir24_8.c', 'trie.c') >> headers = files('rte_fib.h', 'rte_fib6.h') >> deps += ['rib'] >> + >> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f') >> +if cc.has_argument('-mavx512dq') >> +dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp', >> +'dir24_8_avx512.c', >> +dependencies: static_rte_eal, >> +c_args: cflags + ['-mavx512f'] + ['-mavx512dq']) >> +objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c') >> +cflags += '-DCC_DIR24_8_AVX512_SUPPORT' >> +endif >> +endif >> diff --git a/lib/librte_fib/rte_fib.h b/lib/librte_fib/rte_fib.h >> index db35685..2919d13 100644 >> --- a/lib/librte_fib/rte_fib.h >> +++ b/lib/librte_fib/rte_fib.h >> @@ -54,7 +54,8 @@ enum rte_fib_dir24_8_nh_sz { >> enum rte_fib_dir24_8_lookup_type { >> RTE_FIB_DIR24_8_SCALAR_MACRO, >> RTE_FIB_DIR24_8_SCALAR_INLINE, >> -RTE_FIB_DIR24_8_SCALAR_UNI >> +RTE_FIB_DIR24_8_SCALAR_UNI, >> +RTE_FIB_DIR24_8_VECTOR >> }; >> >> /** FIB configuration structure */ >> -- > > Acked-by: Konstantin Ananyev > >> 2.7.4 > -- Regards, Vladimir