From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <dev-bounces@dpdk.org>
Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124])
	by inbox.dpdk.org (Postfix) with ESMTP id 0A5FB4559B;
	Fri,  5 Jul 2024 19:46:07 +0200 (CEST)
Received: from mails.dpdk.org (localhost [127.0.0.1])
	by mails.dpdk.org (Postfix) with ESMTP id 68A3842FDF;
	Fri,  5 Jul 2024 19:45:49 +0200 (CEST)
Received: from foss.arm.com (foss.arm.com [217.140.110.172])
 by mails.dpdk.org (Postfix) with ESMTP id 3F56842F7F
 for <dev@dpdk.org>; Fri,  5 Jul 2024 19:45:33 +0200 (CEST)
Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14])
 by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id BC99B150C;
 Fri,  5 Jul 2024 10:45:57 -0700 (PDT)
Received: from ampere-altra-2-3.austin.arm.com
 (ampere-altra-2-3.austin.arm.com [10.118.14.97])
 by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 9E11D3F73B;
 Fri,  5 Jul 2024 10:45:32 -0700 (PDT)
From: Yoan Picchi <yoan.picchi@arm.com>
To: Yipeng Wang <yipeng1.wang@intel.com>,
 Sameh Gobriel <sameh.gobriel@intel.com>,
 Bruce Richardson <bruce.richardson@intel.com>,
 Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Cc: dev@dpdk.org, nd@arm.com, Yoan Picchi <yoan.picchi@arm.com>,
 Ruifeng Wang <ruifeng.wang@arm.com>, Nathan Brown <nathan.brown@arm.com>
Subject: [PATCH v11 4/7] hash: pack the hitmask for hash in bulk lookup
Date: Fri,  5 Jul 2024 17:45:23 +0000
Message-Id: <20240705174526.3035295-5-yoan.picchi@arm.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20240705174526.3035295-1-yoan.picchi@arm.com>
References: <20231020165159.1649282-1-yoan.picchi@arm.com>
 <20240705174526.3035295-1-yoan.picchi@arm.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: DPDK patches and discussions <dev.dpdk.org>
List-Unsubscribe: <https://mails.dpdk.org/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://mails.dpdk.org/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <https://mails.dpdk.org/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
Errors-To: dev-bounces@dpdk.org

Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweave the primary
and secondary matches which allow a better cache usage and
enable future improvements for the SIMD implementations
The default non SIMD path now use this dense mask.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h     |  47 ++++----
 lib/hash/compare_signatures_generic_pvt.h |  31 +++---
 lib/hash/compare_signatures_x86_pvt.h     |   9 +-
 lib/hash/rte_cuckoo_hash.c                | 124 ++++++++++++++++------
 4 files changed, 145 insertions(+), 66 deletions(-)

diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index 74b3286c95..0fc657c49b 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -6,48 +6,57 @@
 #ifndef _COMPARE_SIGNATURE_ARM_PVT_H_
 #define _COMPARE_SIGNATURE_ARM_PVT_H_
 
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-	unsigned int i;
 
-	/* For match mask the first bit of every two bits indicates the match */
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+		"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
 		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
 
 		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
 		}
 		break;
 #endif
 	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
diff --git a/lib/hash/compare_signatures_generic_pvt.h b/lib/hash/compare_signatures_generic_pvt.h
index 43587adcef..1d065d4c28 100644
--- a/lib/hash/compare_signatures_generic_pvt.h
+++ b/lib/hash/compare_signatures_generic_pvt.h
@@ -6,27 +6,34 @@
 #ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_
 #define _COMPARE_SIGNATURE_GENERIC_PVT_H_
 
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
+			__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-		*prim_hash_matches |=
-			((sig == prim_bkt->sig_current[i]) << (i << 1));
-		*sec_hash_matches |=
-			((sig == sec_bkt->sig_current[i]) << (i << 1));
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+			"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+		*hitmask_buffer |= ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 	}
 }
 
diff --git a/lib/hash/compare_signatures_x86_pvt.h b/lib/hash/compare_signatures_x86_pvt.h
index f77b37f1cd..03e9c44e53 100644
--- a/lib/hash/compare_signatures_x86_pvt.h
+++ b/lib/hash/compare_signatures_x86_pvt.h
@@ -6,14 +6,21 @@
 #ifndef _COMPARE_SIGNATURE_X86_PVT_H_
 #define _COMPARE_SIGNATURE_X86_PVT_H_
 
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 0
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 739f7927b8..187918a05a 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1908,22 +1908,41 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1934,10 +1953,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1951,10 +1970,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1976,13 +2003,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2005,7 +2032,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2055,11 +2082,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2073,14 +2109,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2091,10 +2139,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2155,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2136,13 +2192,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2169,7 +2225,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.34.1