From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 0667D43CF7; Tue, 19 Mar 2024 14:09:49 +0100 (CET) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 8263F402D1; Tue, 19 Mar 2024 14:09:48 +0100 (CET) Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by mails.dpdk.org (Postfix) with ESMTP id 5A46B40298 for ; Tue, 19 Mar 2024 14:09:47 +0100 (CET) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id A7B82106F; Tue, 19 Mar 2024 06:10:21 -0700 (PDT) Received: from [10.1.30.128] (e125442.arm.com [10.1.30.128]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id AC9233F762; Tue, 19 Mar 2024 06:09:44 -0700 (PDT) Message-ID: <169368f3-c385-4591-ab3c-531c8918eba3@foss.arm.com> Date: Tue, 19 Mar 2024 13:09:42 +0000 MIME-Version: 1.0 User-Agent: Mozilla Thunderbird Subject: Re: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Content-Language: en-US To: Konstantin Ananyev , Yoan Picchi , Thomas Monjalon , Yipeng Wang , Sameh Gobriel , Bruce Richardson , Vladimir Medvedkin Cc: "dev@dpdk.org" , "nd@arm.com" , Ruifeng Wang , Nathan Brown References: <20231020165159.1649282-1-yoan.picchi@arm.com> <20240312154215.802374-1-yoan.picchi@arm.com> <20240312154215.802374-2-yoan.picchi@arm.com> <28cff0e5ea404051acdcb71c567d9d7c@huawei.com> From: Yoan Picchi In-Reply-To: <28cff0e5ea404051acdcb71c567d9d7c@huawei.com> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 7bit X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org On 3/19/24 10:41, Konstantin Ananyev wrote: > > Hi, > >> Current hitmask includes padding due to Intel's SIMD >> implementation detail. This patch allows non Intel SIMD >> implementations to benefit from a dense hitmask. >> In addition, the new dense hitmask interweave the primary >> and secondary matches which allow a better cache usage and >> enable future improvements for the SIMD implementations >> >> Signed-off-by: Yoan Picchi >> Reviewed-by: Ruifeng Wang >> Reviewed-by: Nathan Brown >> --- >> .mailmap | 2 + >> lib/hash/arch/arm/compare_signatures.h | 61 +++++++ >> lib/hash/arch/common/compare_signatures.h | 38 +++++ >> lib/hash/arch/x86/compare_signatures.h | 53 ++++++ >> lib/hash/rte_cuckoo_hash.c | 192 ++++++++++++---------- >> 5 files changed, 255 insertions(+), 91 deletions(-) >> create mode 100644 lib/hash/arch/arm/compare_signatures.h >> create mode 100644 lib/hash/arch/common/compare_signatures.h >> create mode 100644 lib/hash/arch/x86/compare_signatures.h >> >> diff --git a/.mailmap b/.mailmap >> index 66ebc20666..00b50414d3 100644 >> --- a/.mailmap >> +++ b/.mailmap >> @@ -494,6 +494,7 @@ Hari Kumar Vemula >> Harini Ramakrishnan >> Hariprasad Govindharajan >> Harish Patil >> +Harjot Singh >> Harman Kalra >> Harneet Singh >> Harold Huang >> @@ -1633,6 +1634,7 @@ Yixue Wang >> Yi Yang >> Yi Zhang >> Yoann Desmouceaux >> +Yoan Picchi >> Yogesh Jangra >> Yogev Chaimovich >> Yongjie Gu >> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h >> new file mode 100644 >> index 0000000000..1af6ba8190 >> --- /dev/null >> +++ b/lib/hash/arch/arm/compare_signatures.h >> @@ -0,0 +1,61 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(c) 2010-2016 Intel Corporation >> + * Copyright(c) 2018-2024 Arm Limited >> + */ >> + >> +/* >> + * Arm's version uses a densely packed hitmask buffer: >> + * Every bit is in use. >> + */ >> + >> +#include >> +#include >> +#include >> +#include "rte_cuckoo_hash.h" >> + >> +#define DENSE_HASH_BULK_LOOKUP 1 >> + >> +static inline void >> +compare_signatures_dense(uint16_t *hitmask_buffer, >> + const uint16_t *prim_bucket_sigs, >> + const uint16_t *sec_bucket_sigs, >> + uint16_t sig, >> + enum rte_hash_sig_compare_function sig_cmp_fn) >> +{ >> + >> + static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8), >> + "The hitmask must be exactly wide enough to accept the whole hitmask if it is dense"); >> + >> + /* For match mask every bits indicates the match */ >> + switch (sig_cmp_fn) { >> +#if RTE_HASH_BUCKET_ENTRIES <= 8 >> + case RTE_HASH_COMPARE_NEON: { >> + uint16x8_t vmat, vsig, x; >> + int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; >> + uint16_t low, high; >> + >> + vsig = vld1q_dup_u16((uint16_t const *)&sig); >> + /* Compare all signatures in the primary bucket */ >> + vmat = vceqq_u16(vsig, >> + vld1q_u16((uint16_t const *)prim_bucket_sigs)); >> + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); >> + low = (uint16_t)(vaddvq_u16(x)); >> + /* Compare all signatures in the secondary bucket */ >> + vmat = vceqq_u16(vsig, >> + vld1q_u16((uint16_t const *)sec_bucket_sigs)); >> + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); >> + high = (uint16_t)(vaddvq_u16(x)); >> + *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES; >> + >> + } >> + break; >> +#endif >> + default: >> + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { >> + *hitmask_buffer |= >> + ((sig == prim_bucket_sigs[i]) << i); >> + *hitmask_buffer |= >> + ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES; >> + } >> + } >> +} >> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h >> new file mode 100644 >> index 0000000000..dcf9444032 >> --- /dev/null >> +++ b/lib/hash/arch/common/compare_signatures.h >> @@ -0,0 +1,38 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(c) 2010-2016 Intel Corporation >> + * Copyright(c) 2018-2024 Arm Limited >> + */ >> + >> +/* >> + * The generic version could use either a dense or sparsely packed hitmask buffer, >> + * but the dense one is slightly faster. >> + */ >> + >> +#include >> +#include >> +#include >> +#include "rte_cuckoo_hash.h" >> + >> +#define DENSE_HASH_BULK_LOOKUP 1 >> + >> +static inline void >> +compare_signatures_dense(uint16_t *hitmask_buffer, >> + const uint16_t *prim_bucket_sigs, >> + const uint16_t *sec_bucket_sigs, >> + uint16_t sig, >> + enum rte_hash_sig_compare_function sig_cmp_fn) >> +{ >> + (void) sig_cmp_fn; >> + >> + static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8), >> + "The hitmask must be exactly wide enough to accept the whole hitmask if it is dense"); >> + >> + /* For match mask every bits indicates the match */ >> + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { >> + *hitmask_buffer |= >> + ((sig == prim_bucket_sigs[i]) << i); >> + *hitmask_buffer |= >> + ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES; >> + } >> + >> +} > > Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way. > One question I have - does it mean that now for x86 we always use 'sparse' while for all other > ARM and non-ARM platforms we switch to 'dense'? Yes it does. x86 support only the sparse method (the legacy one). Arm and generic code could support both dense and sparse. The reason I made them use the dense method is because it was slightly faster in my tests. (no need to add padding and shifts amongst other benefit.) > >> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h >> new file mode 100644 >> index 0000000000..7eec499e1f >> --- /dev/null >> +++ b/lib/hash/arch/x86/compare_signatures.h >> @@ -0,0 +1,53 @@ >> +/* SPDX-License-Identifier: BSD-3-Clause >> + * Copyright(c) 2010-2016 Intel Corporation >> + * Copyright(c) 2018-2024 Arm Limited >> + */ >> + >> +/* >> + * x86's version uses a sparsely packed hitmask buffer: >> + * Every other bit is padding. >> + */ >> + >> +#include >> +#include >> +#include >> +#include "rte_cuckoo_hash.h" >> + >> +#define DENSE_HASH_BULK_LOOKUP 0 >> + >> +static inline void >> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, >> + const struct rte_hash_bucket *prim_bkt, >> + const struct rte_hash_bucket *sec_bkt, >> + uint16_t sig, >> + enum rte_hash_sig_compare_function sig_cmp_fn) >> +{ >> + /* For match mask the first bit of every two bits indicates the match */ >> + switch (sig_cmp_fn) { >> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8 >> + case RTE_HASH_COMPARE_SSE: >> + /* Compare all signatures in the bucket */ >> + *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( >> + _mm_load_si128( >> + (__m128i const *)prim_bkt->sig_current), >> + _mm_set1_epi16(sig))); >> + /* Extract the even-index bits only */ >> + *prim_hash_matches &= 0x5555; >> + /* Compare all signatures in the bucket */ >> + *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( >> + _mm_load_si128( >> + (__m128i const *)sec_bkt->sig_current), >> + _mm_set1_epi16(sig))); >> + /* Extract the even-index bits only */ >> + *sec_hash_matches &= 0x5555; >> + break; >> +#endif /* defined(__SSE2__) */ >> + default: >> + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { >> + *prim_hash_matches |= >> + ((sig == prim_bkt->sig_current[i]) << (i << 1)); >> + *sec_hash_matches |= >> + ((sig == sec_bkt->sig_current[i]) << (i << 1)); >> + } >> + } >> +}