DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH v3 0/4] hash: add SVE support for bulk key lookup
@ 2023-11-07 12:18 Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Yoan Picchi @ 2023-11-07 12:18 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: Nathan Brown, Ruifeng Wang, dev, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several codding-style fix

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                   |   2 +
 app/test/test_hash.c       |  99 ++++++++++----
 lib/hash/rte_cuckoo_hash.c | 264 +++++++++++++++++++++++++++++--------
 lib/hash/rte_cuckoo_hash.h |   1 +
 4 files changed, 287 insertions(+), 79 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3 1/4] hash: pack the hitmask for hash in bulk lookup
  2023-11-07 12:18 [PATCH v3 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2023-11-07 12:18 ` Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 2/4] hash: optimize compare signature for NEON Yoan Picchi
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2023-11-07 12:18 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: Nathan Brown, Ruifeng Wang, dev, Yoan Picchi

Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3f5bab26a8..b9c49aa7f6 100644
--- a/.mailmap
+++ b/.mailmap
@@ -485,6 +485,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1602,6 +1603,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 19b23f2a97..2aa96eb862 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1850,8 +1850,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << i);
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << i);
+		}
+	}
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
@@ -1878,25 +1920,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 		/* Extract the even-index bits only */
 		*sec_hash_matches &= 0x5555;
 		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
+#endif /* defined(__SSE2__) */
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
 			*prim_hash_matches |=
@@ -1907,6 +1931,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 	}
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1921,18 +1947,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+			primary_bkt[i], secondary_bkt[i],
+			sig[i], h->sig_cmp_fn);
+#else
+		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+#endif
 
 		if (prim_hitmask[i]) {
 			uint32_t first_hit =
 					__builtin_ctzl(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1946,7 +1984,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		if (sec_hitmask[i]) {
 			uint32_t first_hit =
 					__builtin_ctzl(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1963,7 +2001,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		while (prim_hitmask[i]) {
 			uint32_t hit_index =
 					__builtin_ctzl(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1985,13 +2023,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 
 		while (sec_hitmask[i]) {
 			uint32_t hit_index =
 					__builtin_ctzl(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2014,7 +2052,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2069,6 +2107,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2082,14 +2126,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+#else
+			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+				primary_bkt[i], secondary_bkt[i],
+				sig[i], h->sig_cmp_fn);
+#endif
 
 			if (prim_hitmask[i]) {
 				uint32_t first_hit =
 						__builtin_ctzl(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2103,7 +2153,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			if (sec_hitmask[i]) {
 				uint32_t first_hit =
 						__builtin_ctzl(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2119,7 +2169,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2145,13 +2195,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 
 			while (sec_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2178,7 +2228,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3 2/4] hash: optimize compare signature for NEON
  2023-11-07 12:18 [PATCH v3 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2023-11-07 12:18 ` Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2023-11-07 12:18 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: Nathan Brown, Ruifeng Wang, dev, Yoan Picchi

Upon a successful comparison, NEON sets all the bits in the lane to 1
We can skip shifting by simply masking with specific masks.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 2aa96eb862..a4b907c45c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1864,19 +1864,17 @@ compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16x8_t vmat, x;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
 		}
 		break;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3 3/4] test/hash: check bulk lookup of keys after collision
  2023-11-07 12:18 [PATCH v3 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2023-11-07 12:18 ` Yoan Picchi
  2023-11-07 12:18 ` [PATCH v3 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
  4 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2023-11-07 12:18 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: Nathan Brown, Ruifeng Wang, dev, Yoan Picchi, Harjot Singh

This patch adds unit test for rte_hash_lookup_bulk().
It also update the test_full_bucket test to the current number of entries
in a hash bucket.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..c4e7f8190e 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | 3 << 16;
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3 4/4] hash: add SVE support for bulk key lookup
  2023-11-07 12:18 [PATCH v3 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (2 preceding siblings ...)
  2023-11-07 12:18 ` [PATCH v3 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2023-11-07 12:18 ` Yoan Picchi
  2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
  4 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2023-11-07 12:18 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: Nathan Brown, Ruifeng Wang, dev, Yoan Picchi, Harjot Singh

- Implemented SVE code for comparing signatures in bulk lookup.
- Added Defines in code for SVE code support.
- Optimise NEON code
- New SVE code is ~5% slower than optimized NEON for N2 processor.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
 lib/hash/rte_cuckoo_hash.h |   1 +
 2 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a4b907c45c..61637d02eb 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -435,8 +435,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1853,37 +1856,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 #if defined(__ARM_NEON)
 
 static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
 	unsigned int i;
 
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, x;
+		uint16x8_t vmat, hit1, hit2;
 		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
+
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
+		}
+		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uin16_t lower_half = 0;
+				uin16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
 		}
 		break;
+#endif
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << i);
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << i);
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
@@ -1901,7 +1970,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1941,14 +2010,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	__hash_rw_reader_lock(h);
@@ -1956,18 +2029,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-			primary_bkt[i], secondary_bkt[i],
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					__builtin_ctzl(prim_hitmask[i])
+					__builtin_ctzl(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
@@ -1979,9 +2058,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					__builtin_ctzl(sec_hitmask[i])
+					__builtin_ctzl(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
@@ -1996,9 +2075,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					__builtin_ctzl(prim_hitmask[i])
+					__builtin_ctzl(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
@@ -2021,12 +2108,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					__builtin_ctzl(sec_hitmask[i])
+					__builtin_ctzl(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
@@ -2050,7 +2137,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2100,15 +2187,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	for (i = 0; i < num_keys; i++)
@@ -2125,18 +2215,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-				primary_bkt[i], secondary_bkt[i],
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						__builtin_ctzl(prim_hitmask[i])
+						__builtin_ctzl(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
@@ -2148,9 +2244,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						__builtin_ctzl(sec_hitmask[i])
+						__builtin_ctzl(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
@@ -2164,9 +2260,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						__builtin_ctzl(prim_hitmask[i])
+						__builtin_ctzl(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
@@ -2193,12 +2297,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						__builtin_ctzl(sec_hitmask[i])
+						__builtin_ctzl(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
@@ -2226,7 +2330,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index eb2644f74b..356ec2a69e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -148,6 +148,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v4 0/4] hash: add SVE support for bulk key lookup
  2023-11-07 12:18 [PATCH v3 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (3 preceding siblings ...)
  2023-11-07 12:18 ` [PATCH v3 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-02-26 17:01 ` Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  4 siblings, 4 replies; 10+ messages in thread
From: Yoan Picchi @ 2024-02-26 17:01 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several codding-style fix

V3->V4:
  Rebase

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                   |   2 +
 app/test/test_hash.c       |  99 ++++++++++----
 lib/hash/rte_cuckoo_hash.c | 264 +++++++++++++++++++++++++++++--------
 lib/hash/rte_cuckoo_hash.h |   1 +
 4 files changed, 287 insertions(+), 79 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v4 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
@ 2024-02-26 17:02   ` Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2024-02-26 17:02 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/.mailmap b/.mailmap
index 12d2875641..60500bbe36 100644
--- a/.mailmap
+++ b/.mailmap
@@ -492,6 +492,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1625,6 +1626,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0550165584 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1857,8 +1857,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << i);
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << i);
+		}
+	}
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
@@ -1885,25 +1927,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 		/* Extract the even-index bits only */
 		*sec_hash_matches &= 0x5555;
 		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
+#endif /* defined(__SSE2__) */
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
 			*prim_hash_matches |=
@@ -1914,6 +1938,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 	}
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1928,18 +1954,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+			primary_bkt[i], secondary_bkt[i],
+			sig[i], h->sig_cmp_fn);
+#else
+		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+#endif
 
 		if (prim_hitmask[i]) {
 			uint32_t first_hit =
 					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1953,7 +1991,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		if (sec_hitmask[i]) {
 			uint32_t first_hit =
 					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1970,7 +2008,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		while (prim_hitmask[i]) {
 			uint32_t hit_index =
 					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +2030,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 
 		while (sec_hitmask[i]) {
 			uint32_t hit_index =
 					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2059,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2076,6 +2114,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2133,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+#else
+			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+				primary_bkt[i], secondary_bkt[i],
+				sig[i], h->sig_cmp_fn);
+#endif
 
 			if (prim_hitmask[i]) {
 				uint32_t first_hit =
 						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2110,7 +2160,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			if (sec_hitmask[i]) {
 				uint32_t first_hit =
 						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2126,7 +2176,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2202,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 
 			while (sec_hitmask[i]) {
 				uint32_t hit_index =
 						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2235,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v4 2/4] hash: optimize compare signature for NEON
  2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-02-26 17:02   ` Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2024-02-26 17:02 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Upon a successful comparison, NEON sets all the bits in the lane to 1
We can skip shifting by simply masking with specific masks.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0550165584..a07dd3a28d 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1871,19 +1871,17 @@ compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16x8_t vmat, x;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
 		}
 		break;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v4 3/4] test/hash: check bulk lookup of keys after collision
  2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-02-26 17:02   ` Yoan Picchi
  2024-02-26 17:02   ` [PATCH v4 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2024-02-26 17:02 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown

This patch adds unit test for rte_hash_lookup_bulk().
It also update the test_full_bucket test to the current number of entries
in a hash bucket.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..c4e7f8190e 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | 3 << 16;
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v4 4/4] hash: add SVE support for bulk key lookup
  2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-02-26 17:02   ` [PATCH v4 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-02-26 17:02   ` Yoan Picchi
  3 siblings, 0 replies; 10+ messages in thread
From: Yoan Picchi @ 2024-02-26 17:02 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang

- Implemented SVE code for comparing signatures in bulk lookup.
- Added Defines in code for SVE code support.
- Optimise NEON code
- New SVE code is ~5% slower than optimized NEON for N2 processor.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>

Change-Id: Ief614e2f90fd85484195b8116bfbf56d6dfec71e
---
 lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
 lib/hash/rte_cuckoo_hash.h |   1 +
 2 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a07dd3a28d..231d6d6ded 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 #if defined(__ARM_NEON)
 
 static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
 	unsigned int i;
 
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, x;
+		uint16x8_t vmat, hit1, hit2;
 		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
+
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
+		}
+		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
 		}
 		break;
+#endif
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << i);
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << i);
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
@@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	__hash_rw_reader_lock(h);
@@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-			primary_bkt[i], secondary_bkt[i],
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
+					rte_ctz32(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
@@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
+					rte_ctz32(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
@@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
+					rte_ctz32(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
@@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
+					rte_ctz32(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
@@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	for (i = 0; i < num_keys; i++)
@@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-				primary_bkt[i], secondary_bkt[i],
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
+						rte_ctz32(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
@@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
+						rte_ctz32(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
@@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
+						rte_ctz32(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
@@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
+						rte_ctz32(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
@@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index 8ea793c66e..ed18e1f41e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-02-27  6:03 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-07 12:18 [PATCH v3 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
2023-11-07 12:18 ` [PATCH v3 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2023-11-07 12:18 ` [PATCH v3 2/4] hash: optimize compare signature for NEON Yoan Picchi
2023-11-07 12:18 ` [PATCH v3 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2023-11-07 12:18 ` [PATCH v3 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
2024-02-26 17:01 ` [PATCH v4 0/4] " Yoan Picchi
2024-02-26 17:02   ` [PATCH v4 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2024-02-26 17:02   ` [PATCH v4 2/4] hash: optimize compare signature for NEON Yoan Picchi
2024-02-26 17:02   ` [PATCH v4 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2024-02-26 17:02   ` [PATCH v4 4/4] hash: add SVE support for bulk key lookup Yoan Picchi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).