DPDK patches and discussions
* [PATCH v2 0/4] hash: add SVE support for bulk key lookup
@ 2023-10-20 16:51 Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                   ` (8 more replies)
  0 siblings, 9 replies; 43+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC
  Cc: dev, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signatures of the
relevant functions to support dense hitmasks (no padding) and to
interleave the primary and secondary hitmasks in a single word instead
of keeping them in separate arrays.
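
To illustrate the two layouts, here is a minimal scalar sketch (the
helper names and the 8-entry bucket size are assumptions for the
example, not code from the patches):

#include <stdint.h>

#define BUCKET_ENTRIES 8	/* assumed bucket size for the example */

/* Sparse layout (legacy Intel path): each hit occupies two bits, so
 * hit indexes must be recovered with ">> 1". */
static uint32_t
sparse_hitmask(const uint16_t sigs[BUCKET_ENTRIES], uint16_t sig)
{
	uint32_t m = 0;
	unsigned int i;

	for (i = 0; i < BUCKET_ENTRIES; i++)
		m |= (uint32_t)(sigs[i] == sig) << (2 * i);
	return m;
}

/* Dense layout (this series): one bit per entry, with the primary
 * bucket in the low byte and the secondary bucket in the high byte
 * of the same uint16_t. */
static uint16_t
dense_hitmask(const uint16_t prim[BUCKET_ENTRIES],
	      const uint16_t sec[BUCKET_ENTRIES], uint16_t sig)
{
	uint16_t m = 0;
	unsigned int i;

	for (i = 0; i < BUCKET_ENTRIES; i++) {
		m |= (uint16_t)(prim[i] == sig) << i;
		m |= (uint16_t)(sec[i] == sig) << (i + BUCKET_ENTRIES);
	}
	return m;
}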

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There is no significant change on Intel (measured on Sapphire Rapids)
  NEON is 7-10% faster (measured on an Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (measured on a
    Graviton 3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (measured on a
    Graviton 3 cloud instance)

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                   |   2 +
 app/test/test_hash.c       |  99 ++++++++++----
 lib/hash/rte_cuckoo_hash.c | 264 +++++++++++++++++++++++++++++--------
 lib/hash/rte_cuckoo_hash.h |   1 +
 4 files changed, 287 insertions(+), 79 deletions(-)

-- 
2.25.1



* [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 2/4] hash: optimize compare signature for NEON Yoan Picchi
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, Yoan Picchi

The current hitmask includes padding that only exists due to an Intel
SIMD implementation detail. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
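
As a sketch of the extraction loop this enables (walk_hits is a
hypothetical helper, not code from this patch), the same code serves
both layouts: with the dense layout hitmask_padding is 0 and the
counted trailing zeros give the entry index directly, while the padded
layout needs a ">> 1":

#include <stdint.h>

static void
walk_hits(uint32_t hitmask, int hitmask_padding)
{
	while (hitmask) {
		/* Bit position of the next hit; halved when each match
		 * occupies two bits (padded layout). */
		uint32_t hit_index =
			__builtin_ctzl(hitmask) >> hitmask_padding;

		/* ... compare the key at key_idx[hit_index] here ... */

		/* Clear the consumed bit. In the padded layout only
		 * even bit positions are ever set, so clearing a
		 * single bit is enough. */
		hitmask &= ~(1U << (hit_index << hitmask_padding));
	}
}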

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/.mailmap b/.mailmap
index 3f5bab26a8..b9c49aa7f6 100644
--- a/.mailmap
+++ b/.mailmap
@@ -485,6 +485,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1602,6 +1603,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 19b23f2a97..2aa96eb862 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1850,8 +1850,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For the match mask, every bit indicates a match */
+	switch (sig_cmp_fn) {
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << i);
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << i);
+		}
+	}
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
@@ -1878,25 +1920,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 		/* Extract the even-index bits only */
 		*sec_hash_matches &= 0x5555;
 		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
+#endif /* defined(__SSE2__) */
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
 			*prim_hash_matches |=
@@ -1907,6 +1931,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 	}
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1921,18 +1947,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+			primary_bkt[i], secondary_bkt[i],
+			sig[i], h->sig_cmp_fn);
+#else
+		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+#endif
 
 		if (prim_hitmask[i]) {
 			uint32_t first_hit =
 					__builtin_ctzl(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1946,7 +1984,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		if (sec_hitmask[i]) {
 			uint32_t first_hit =
 					__builtin_ctzl(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1963,7 +2001,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		while (prim_hitmask[i]) {
 			uint32_t hit_index =
 					__builtin_ctzl(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1985,13 +2023,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 
 		while (sec_hitmask[i]) {
 			uint32_t hit_index =
 					__builtin_ctzl(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2014,7 +2052,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2069,6 +2107,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2082,14 +2126,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+#else
+			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+				primary_bkt[i], secondary_bkt[i],
+				sig[i], h->sig_cmp_fn);
+#endif
 
 			if (prim_hitmask[i]) {
 				uint32_t first_hit =
 						__builtin_ctzl(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2103,7 +2153,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			if (sec_hitmask[i]) {
 				uint32_t first_hit =
 						__builtin_ctzl(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2119,7 +2169,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2145,13 +2195,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 
 			while (sec_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2178,7 +2228,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1



* [PATCH v2 2/4] hash: optimize compare signature for NEON
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, Yoan Picchi

Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can therefore skip the shift and simply mask each lane with its own
power-of-two constant.
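
A standalone sketch of the trick (the helper name and the 8-entry
bucket are assumptions for the example): each matching lane of
vceqq_u16 is 0xFFFF, so ANDing with {1, 2, 4, ..., 128} leaves one
distinct bit per matching lane, and the horizontal add vaddvq_u16 then
acts as an OR because the bits never overlap:

#include <arm_neon.h>
#include <stdint.h>

static uint16_t
neon_hitmask(const uint16_t sigs[8], uint16_t sig)
{
	const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
	/* Matching lanes become 0xFFFF, the others 0. */
	uint16x8_t vmat = vceqq_u16(vdupq_n_u16(sig), vld1q_u16(sigs));
	/* Keep one unique bit per matching lane, then sum the lanes. */
	return vaddvq_u16(vandq_u16(vmat, mask));
}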

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 2aa96eb862..a4b907c45c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1864,19 +1864,17 @@ compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches
 	/* For the match mask, every bit indicates a match */
 	switch (sig_cmp_fn) {
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16x8_t vmat, x;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
 		}
 		break;
-- 
2.25.1



* [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, Yoan Picchi, Harjot Singh

This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to match the current number
of entries in a hash bucket.
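
For context, this is the shape of the API under test (a minimal usage
sketch; handle, key_a and key_b are assumed to exist): the call
returns 0 on success and fills each positions[i] with the stored key's
index, or -ENOENT when that key is absent.

	const void *key_array[2] = { &key_a, &key_b };
	int32_t positions[2];
	int ret;

	ret = rte_hash_lookup_bulk(handle, key_array, 2, positions);
	if (ret == 0 && positions[0] >= 0) {
		/* key_a is stored at index positions[0] */
	}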

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..b6e22c5ecc 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | 3 << 16;
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/* Bulk lookup after add with same hash */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	if (ret == 0)
+		for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+			print_key_info("Blk_Lkp", key_array[i], pos[i]);
+			RETURN_IF_ERROR(pos[i] != expected_pos[i],
+					"failed to find key (pos[%u]=%d)", i, pos[i]);
+		}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk lookup on empty table */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	if (ret == 0)
+		for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+			print_key_info("Blk_Lkp", key_array[i], pos[i]);
+			RETURN_IF_ERROR(pos[i] != -ENOENT,
+					"failed to find key (pos[%u]=%d)", i, pos[i]);
+		}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1



* [PATCH v2 4/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (2 preceding siblings ...)
  2023-10-20 16:51 ` [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, Yoan Picchi, Harjot Singh

- Implemented SVE code for comparing signatures in bulk lookup (the
  core idea is sketched below).
- Added the defines needed for SVE support.
- Optimised the NEON code.
- The new SVE code is ~3% slower than the optimized NEON on an N2
  processor.
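
A minimal sketch of the predicate-to-bitmask conversion the SVE path
relies on (the helper name is an assumption, and it handles at most
one vector's worth of signatures): matching lanes receive
1 << lane_index through a zeroing predicated shift, and an
OR-reduction collapses the lanes into a scalar hitmask.

#include <arm_sve.h>
#include <stdint.h>

static uint16_t
sve_hitmask(const uint16_t *sigs, uint16_t sig, uint32_t n)
{
	svbool_t pred = svwhilelt_b16(0u, n);	/* first n lanes active */
	svuint16_t lane_idx = svindex_u16(0, 1);	/* {0, 1, 2, ...} */

	/* Predicated compare: one predicate bit per matching signature. */
	svbool_t match = svcmpeq_u16(pred, svdup_u16(sig),
				     svld1_u16(pred, sigs));
	/* 1 << lane_idx in matching lanes, 0 elsewhere (zeroing form). */
	svuint16_t bits = svlsl_u16_z(match, svdup_u16(1), lane_idx);

	/* OR-reduce the lanes into a scalar bitmask. */
	return svorv_u16(svptrue_b16(), bits);
}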

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
 lib/hash/rte_cuckoo_hash.h |   1 +
 2 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a4b907c45c..cda39d1441 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -435,8 +435,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1853,37 +1856,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 #if defined(__ARM_NEON)
 
 static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
 	unsigned int i;
 
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
 	/* For the match mask, every bit indicates a match */
 	switch (sig_cmp_fn) {
+#if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, x;
+		uint16x8_t vmat, hit1, hit2;
 		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
+
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
+		}
+		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				int lower_half = 0;
+				int upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				*(hitmask_buffer+(i/8)) = lower_half | upper_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
 		}
 		break;
+#endif
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << i);
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << i);
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
@@ -1901,7 +1970,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1941,14 +2010,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	__hash_rw_reader_lock(h);
@@ -1956,18 +2029,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-			primary_bkt[i], secondary_bkt[i],
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					__builtin_ctzl(prim_hitmask[i])
+					__builtin_ctzl(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
@@ -1979,9 +2058,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					__builtin_ctzl(sec_hitmask[i])
+					__builtin_ctzl(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
@@ -1996,9 +2075,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					__builtin_ctzl(prim_hitmask[i])
+					__builtin_ctzl(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
@@ -2021,12 +2108,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					__builtin_ctzl(sec_hitmask[i])
+					__builtin_ctzl(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
@@ -2050,7 +2137,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2100,15 +2187,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	for (i = 0; i < num_keys; i++)
@@ -2125,18 +2215,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-				primary_bkt[i], secondary_bkt[i],
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						__builtin_ctzl(prim_hitmask[i])
+						__builtin_ctzl(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
@@ -2148,9 +2244,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						__builtin_ctzl(sec_hitmask[i])
+						__builtin_ctzl(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
@@ -2164,9 +2260,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						__builtin_ctzl(prim_hitmask[i])
+						__builtin_ctzl(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
@@ -2193,12 +2297,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						__builtin_ctzl(sec_hitmask[i])
+						__builtin_ctzl(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
@@ -2226,7 +2330,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index eb2644f74b..356ec2a69e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -148,6 +148,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1



* [PATCH v5 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (3 preceding siblings ...)
  2023-10-20 16:51 ` [PATCH v2 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-02-27 17:41 ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
                   ` (3 subsequent siblings)
  8 siblings, 4 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:41 UTC
  Cc: dev, nd, Yoan Picchi

From: Yoan Picchi <yoan.picchi@foss.arm.com>

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signatures of the
relevant functions to support dense hitmasks (no padding) and to
interleave the primary and secondary hitmasks in a single word instead
of keeping them in separate arrays.

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There is no significant change on Intel (measured on Sapphire Rapids)
  NEON is 7-10% faster (measured on an Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (measured on a
    Graviton 3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (measured on a
    Graviton 3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple of ints to uint16_t in compare_signatures_dense
  Several coding-style fixes

V3->V4:
  Rebase

V4->V5:
  Commit message

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                   |   2 +
 app/test/test_hash.c       |  99 ++++++++++----
 lib/hash/rte_cuckoo_hash.c | 264 +++++++++++++++++++++++++++++--------
 lib/hash/rte_cuckoo_hash.h |   1 +
 4 files changed, 287 insertions(+), 79 deletions(-)

-- 
2.34.1



* [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

The current hitmask includes padding that only exists due to an Intel
SIMD implementation detail. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/.mailmap b/.mailmap
index 12d2875641..60500bbe36 100644
--- a/.mailmap
+++ b/.mailmap
@@ -492,6 +492,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1625,6 +1626,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0550165584 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1857,8 +1857,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For the match mask, every bit indicates a match */
+	switch (sig_cmp_fn) {
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << i);
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << i);
+		}
+	}
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
@@ -1885,25 +1927,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 		/* Extract the even-index bits only */
 		*sec_hash_matches &= 0x5555;
 		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
+#endif /* defined(__SSE2__) */
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
 			*prim_hash_matches |=
@@ -1914,6 +1938,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 	}
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1928,18 +1954,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+			primary_bkt[i], secondary_bkt[i],
+			sig[i], h->sig_cmp_fn);
+#else
+		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+#endif
 
 		if (prim_hitmask[i]) {
 			uint32_t first_hit =
 					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1953,7 +1991,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		if (sec_hitmask[i]) {
 			uint32_t first_hit =
 					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1970,7 +2008,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		while (prim_hitmask[i]) {
 			uint32_t hit_index =
 					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +2030,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 
 		while (sec_hitmask[i]) {
 			uint32_t hit_index =
 					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2059,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2076,6 +2114,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2133,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+#else
+			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+				primary_bkt[i], secondary_bkt[i],
+				sig[i], h->sig_cmp_fn);
+#endif
 
 			if (prim_hitmask[i]) {
 				uint32_t first_hit =
 						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2110,7 +2160,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			if (sec_hitmask[i]) {
 				uint32_t first_hit =
 						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2126,7 +2176,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2202,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 
 			while (sec_hitmask[i]) {
 				uint32_t hit_index =
 						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2235,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.34.1



* [PATCH v5 2/4] hash: optimize compare signature for NEON
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can therefore skip the shift and simply mask each lane with its own
power-of-two constant.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0550165584..a07dd3a28d 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1871,19 +1871,17 @@ compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches
 	/* For the match mask, every bit indicates a match */
 	switch (sig_cmp_fn) {
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16x8_t vmat, x;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
 		}
 		break;
-- 
2.34.1



* [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown

This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to match the current number
of entries in a hash bucket.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..c4e7f8190e 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | 3 << 16;
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/* Bulk lookup after add with same hash */
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk lookup on empty table */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
+	}
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.34.1
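
For readers skimming the diff, here is a minimal sketch of the bulk-lookup
pattern the new test exercises (struct flow_key, keys[] and KEY_PER_BUCKET
as defined in the test; error handling trimmed, so this is an illustration
rather than the test itself):

	static void
	bulk_lookup_example(const struct rte_hash *handle,
			const struct flow_key *keys)
	{
		const void *key_array[KEY_PER_BUCKET + 1];
		int32_t positions[KEY_PER_BUCKET + 1];
		unsigned int i;

		for (i = 0; i < KEY_PER_BUCKET + 1; i++)
			key_array[i] = &keys[i];

		/* One call resolves every key: on success each positions[i]
		 * holds the index returned when the key was added, or
		 * -ENOENT if the key is not in the table.
		 */
		if (rte_hash_lookup_bulk(handle, key_array,
				KEY_PER_BUCKET + 1, positions) != 0)
			return;		/* invalid parameters */

		for (i = 0; i < KEY_PER_BUCKET + 1; i++) {
			if (positions[i] == -ENOENT)
				continue;	/* miss */
			/* hit: positions[i] is this key's slot */
		}
	}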


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-02-27 17:42   ` [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-28 10:56     ` Konstantin Ananyev
  3 siblings, 1 reply; 43+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang

- Implemented SVE code for comparing signatures in bulk lookup.
- Added defines in code for SVE support.
- Optimized the NEON code.
- The new SVE code is ~5% slower than the optimized NEON on an N2 processor.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
 lib/hash/rte_cuckoo_hash.h |   1 +
 2 files changed, 151 insertions(+), 46 deletions(-)

diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a07dd3a28d..231d6d6ded 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 #if defined(__ARM_NEON)
 
 static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
 	unsigned int i;
 
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask buffer must be wide enough to hold the whole dense hitmask");
+
 	/* For match mask every bit indicates a match */
 	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, x;
+		uint16x8_t vmat, hit1, hit2;
 		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
+
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
+		}
+		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
 		}
 		break;
+#endif
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << i);
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << i);
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
@@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	__hash_rw_reader_lock(h);
@@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-			primary_bkt[i], secondary_bkt[i],
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
+					rte_ctz32(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
@@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
+					rte_ctz32(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
@@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
+					rte_ctz32(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
@@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
+					rte_ctz32(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
@@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	for (i = 0; i < num_keys; i++)
@@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-				primary_bkt[i], secondary_bkt[i],
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
+						rte_ctz32(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
@@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
+						rte_ctz32(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
@@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
+						rte_ctz32(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
@@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
+						rte_ctz32(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
@@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index 8ea793c66e..ed18e1f41e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.34.1
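
A note for readers on the two hitmask layouts this patch juggles: the sketch
below is not part of the patch, it only restates how the bulk lookup consumes
the masks. The patch uses rte_ctz32(); the compiler builtin is used here to
keep the snippet self-contained, and masks must be non-zero before counting
trailing zeros:

	#include <stdint.h>

	#define RTE_HASH_BUCKET_ENTRIES 8	/* as assumed by the NEON path */

	/* Entry index of the lowest set bit; pad is 0 for the dense layout
	 * (one bit per entry) and 1 for the sparse SSE layout (two bits per
	 * entry), matching hitmask_padding in the patch.
	 */
	static inline unsigned int
	first_hit(uint32_t hitmask, unsigned int pad)
	{
		return (unsigned int)__builtin_ctz(hitmask) >> pad;
	}

	/* Splitting a dense 16-bit mask into the two per-bucket masks, as
	 * the bulk lookup does through uint8_t loads on the hitmask buffer:
	 * primary bucket in bits 0..7, secondary bucket in bits 8..15.
	 */
	static inline void
	split_dense(uint16_t hitmask, unsigned int *prim, unsigned int *sec)
	{
		*prim = hitmask & 0xff;
		*sec = hitmask >> RTE_HASH_BUCKET_ENTRIES;
	}

This is why the static_asserts tie sizeof(*hitmask_buffer) to
RTE_HASH_BUCKET_ENTRIES: with 8 entries per bucket, both dense masks fit
exactly in one uint16_t per key.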


^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-02-28 10:56     ` Konstantin Ananyev
  2024-02-28 14:48       ` Yoan Picchi
  0 siblings, 1 reply; 43+ messages in thread
From: Konstantin Ananyev @ 2024-02-28 10:56 UTC (permalink / raw)
  To: Yoan Picchi, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang


> 
> - Implemented SVE code for comparing signatures in bulk lookup.
> - Added defines in code for SVE support.
> - Optimized the NEON code.
> - The new SVE code is ~5% slower than the optimized NEON on an N2 processor.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
>  lib/hash/rte_cuckoo_hash.h |   1 +
>  2 files changed, 151 insertions(+), 46 deletions(-)
> 
> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> index a07dd3a28d..231d6d6ded 100644
> --- a/lib/hash/rte_cuckoo_hash.c
> +++ b/lib/hash/rte_cuckoo_hash.c
> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>  	else
>  #elif defined(RTE_ARCH_ARM64)
> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> +	}
>  	else
>  #endif
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>  #if defined(__ARM_NEON)
> 
>  static inline void
> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> -			const struct rte_hash_bucket *prim_bkt,
> -			const struct rte_hash_bucket *sec_bkt,
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
>  			uint16_t sig,
>  			enum rte_hash_sig_compare_function sig_cmp_fn)
>  {
>  	unsigned int i;
> 
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask buffer must be wide enough to hold the whole dense hitmask");
> +
>  	/* For match mask every bit indicates a match */
>  	switch (sig_cmp_fn) {

Can I ask you to move the arch-specific comparison code into arch-specific headers or similar?
It is getting really hard to read and understand the generic code with all these ifdefs and arch-specific instructions...

> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>  	case RTE_HASH_COMPARE_NEON: {
> -		uint16x8_t vmat, x;
> +		uint16x8_t vmat, hit1, hit2;
>  		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>  		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
> 
>  		/* Compare all signatures in the primary bucket */
> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
> -		x = vandq_u16(vmat, mask);
> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
> +		hit1 = vandq_u16(vmat, mask);
> +
>  		/* Compare all signatures in the secondary bucket */
> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
> -		x = vandq_u16(vmat, mask);
> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
> +		hit2 = vandq_u16(vmat, mask);
> +
> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
> +		hit2 = vorrq_u16(hit1, hit2);
> +		*hitmask_buffer = vaddvq_u16(hit2);
> +		}
> +		break;
> +#endif
> +#if defined(RTE_HAS_SVE_ACLE)
> +	case RTE_HASH_COMPARE_SVE: {
> +		svuint16_t vsign, shift, sv_matches;
> +		svbool_t pred, match, bucket_wide_pred;
> +		int i = 0;
> +		uint64_t vl = svcnth();
> +
> +		vsign = svdup_u16(sig);
> +		shift = svindex_u16(0, 1);
> +
> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
> +			svuint16_t primary_array_vect, secondary_array_vect;
> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
> +
> +			/* We merged the two vectors so we can do both comparisons at once */
> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
> +				primary_array_vect,
> +				secondary_array_vect);
> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
> +
> +			/* Compare all signatures in the buckets */
> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
> +			if (svptest_any(svptrue_b16(), match)) {
> +				sv_matches = svdup_u16(1);
> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
> +			}
> +		} else {
> +			do {
> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
> +				uint16_t lower_half = 0;
> +				uint16_t upper_half = 0;
> +				/* Compare all signatures in the primary bucket */
> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> +							&prim_bucket_sigs[i]));
> +				if (svptest_any(svptrue_b16(), match)) {
> +					sv_matches = svdup_u16(1);
> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
> +				}
> +				/* Compare all signatures in the secondary bucket */
> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> +							&sec_bucket_sigs[i]));
> +				if (svptest_any(svptrue_b16(), match)) {
> +					sv_matches = svdup_u16(1);
> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
> +						<< RTE_HASH_BUCKET_ENTRIES;
> +				}
> +				hitmask_buffer[i/8] = upper_half | lower_half;
> +				i += vl;
> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
> +		}
>  		}
>  		break;
> +#endif
>  	default:
>  		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> -			*prim_hash_matches |=
> -				((sig == prim_bkt->sig_current[i]) << i);
> -			*sec_hash_matches |=
> -				((sig == sec_bkt->sig_current[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == prim_bucket_sigs[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>  		}
>  	}
>  }
> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
> 
>  	/* For match mask the first bit of every two bits indicates the match */
>  	switch (sig_cmp_fn) {
> -#if defined(__SSE2__)
> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>  	case RTE_HASH_COMPARE_SSE:
>  		/* Compare all signatures in the bucket */
>  		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  	uint64_t hits = 0;
>  	int32_t i;
>  	int32_t ret;
> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  	struct rte_hash_bucket *cur_bkt, *next_bkt;
> 
>  #if defined(__ARM_NEON)
>  	const int hitmask_padding = 0;
> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +
> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>  #else
>  	const int hitmask_padding = 1;
> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  #endif
> 
>  	__hash_rw_reader_lock(h);
> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  	/* Compare signatures and prefetch key slot of first hit */
>  	for (i = 0; i < num_keys; i++) {
>  #if defined(__ARM_NEON)
> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> -			primary_bkt[i], secondary_bkt[i],
> +		uint16_t *hitmask = &hitmask_buffer[i];
> +		compare_signatures_dense(hitmask,
> +			primary_bkt[i]->sig_current,
> +			secondary_bkt[i]->sig_current,
>  			sig[i], h->sig_cmp_fn);
> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>  #else
> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>  			primary_bkt[i], secondary_bkt[i],
>  			sig[i], h->sig_cmp_fn);
> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>  #endif
> 
> -		if (prim_hitmask[i]) {
> +		if (prim_hitmask) {
>  			uint32_t first_hit =
> -					rte_ctz32(prim_hitmask[i])
> +					rte_ctz32(prim_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				primary_bkt[i]->key_idx[first_hit];
> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  			continue;
>  		}
> 
> -		if (sec_hitmask[i]) {
> +		if (sec_hitmask) {
>  			uint32_t first_hit =
> -					rte_ctz32(sec_hitmask[i])
> +					rte_ctz32(sec_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				secondary_bkt[i]->key_idx[first_hit];
> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  	/* Compare keys, first hits in primary first */
>  	for (i = 0; i < num_keys; i++) {
>  		positions[i] = -ENOENT;
> -		while (prim_hitmask[i]) {
> +#if defined(__ARM_NEON)
> +		uint16_t *hitmask = &hitmask_buffer[i];
> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> +#else
> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
> +#endif
> +		while (prim_hitmask) {
>  			uint32_t hit_index =
> -					rte_ctz32(prim_hitmask[i])
> +					rte_ctz32(prim_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				primary_bkt[i]->key_idx[hit_index];
> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  				positions[i] = key_idx - 1;
>  				goto next_key;
>  			}
> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  		}
> 
> -		while (sec_hitmask[i]) {
> +		while (sec_hitmask) {
>  			uint32_t hit_index =
> -					rte_ctz32(sec_hitmask[i])
> +					rte_ctz32(sec_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				secondary_bkt[i]->key_idx[hit_index];
> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  				positions[i] = key_idx - 1;
>  				goto next_key;
>  			}
> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  		}
>  next_key:
>  		continue;
> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  	uint64_t hits = 0;
>  	int32_t i;
>  	int32_t ret;
> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  	struct rte_hash_bucket *cur_bkt, *next_bkt;
>  	uint32_t cnt_b, cnt_a;
> 
>  #if defined(__ARM_NEON)
>  	const int hitmask_padding = 0;
> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>  #else
>  	const int hitmask_padding = 1;
> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  #endif
> 
>  	for (i = 0; i < num_keys; i++)
> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  		/* Compare signatures and prefetch key slot of first hit */
>  		for (i = 0; i < num_keys; i++) {
>  #if defined(__ARM_NEON)
> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> -				primary_bkt[i], secondary_bkt[i],
> +			uint16_t *hitmask = &hitmask_buffer[i];
> +			compare_signatures_dense(hitmask,
> +				primary_bkt[i]->sig_current,
> +				secondary_bkt[i]->sig_current,
>  				sig[i], h->sig_cmp_fn);
> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>  #else
> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>  				primary_bkt[i], secondary_bkt[i],
>  				sig[i], h->sig_cmp_fn);
> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>  #endif
> 
> -			if (prim_hitmask[i]) {
> +			if (prim_hitmask) {
>  				uint32_t first_hit =
> -						rte_ctz32(prim_hitmask[i])
> +						rte_ctz32(prim_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  					primary_bkt[i]->key_idx[first_hit];
> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  				continue;
>  			}
> 
> -			if (sec_hitmask[i]) {
> +			if (sec_hitmask) {
>  				uint32_t first_hit =
> -						rte_ctz32(sec_hitmask[i])
> +						rte_ctz32(sec_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  					secondary_bkt[i]->key_idx[first_hit];
> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> 
>  		/* Compare keys, first hits in primary first */
>  		for (i = 0; i < num_keys; i++) {
> -			while (prim_hitmask[i]) {
> +#if defined(__ARM_NEON)
> +			uint16_t *hitmask = &hitmask_buffer[i];
> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> +#else
> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
> +#endif
> +			while (prim_hitmask) {
>  				uint32_t hit_index =
> -						rte_ctz32(prim_hitmask[i])
> +						rte_ctz32(prim_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  				rte_atomic_load_explicit(
> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  					positions[i] = key_idx - 1;
>  					goto next_key;
>  				}
> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  			}
> 
> -			while (sec_hitmask[i]) {
> +			while (sec_hitmask) {
>  				uint32_t hit_index =
> -						rte_ctz32(sec_hitmask[i])
> +						rte_ctz32(sec_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  				rte_atomic_load_explicit(
> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  					positions[i] = key_idx - 1;
>  					goto next_key;
>  				}
> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  			}
>  next_key:
>  			continue;
> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
> index 8ea793c66e..ed18e1f41e 100644
> --- a/lib/hash/rte_cuckoo_hash.h
> +++ b/lib/hash/rte_cuckoo_hash.h
> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
>  	RTE_HASH_COMPARE_SCALAR = 0,
>  	RTE_HASH_COMPARE_SSE,
>  	RTE_HASH_COMPARE_NEON,
> +	RTE_HASH_COMPARE_SVE,
>  	RTE_HASH_COMPARE_NUM
>  };
> 
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-28 10:56     ` Konstantin Ananyev
@ 2024-02-28 14:48       ` Yoan Picchi
  2024-03-04 13:35         ` Konstantin Ananyev
  0 siblings, 1 reply; 43+ messages in thread
From: Yoan Picchi @ 2024-02-28 14:48 UTC (permalink / raw)
  To: Konstantin Ananyev, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang

On 2/28/24 10:56, Konstantin Ananyev wrote:
> 
>>
>> - Implemented SVE code for comparing signatures in bulk lookup.
>> - Added defines in code for SVE support.
>> - Optimized the NEON code.
>> - The new SVE code is ~5% slower than the optimized NEON on an N2 processor.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>>   lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
>>   lib/hash/rte_cuckoo_hash.h |   1 +
>>   2 files changed, 151 insertions(+), 46 deletions(-)
>>
>> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
>> index a07dd3a28d..231d6d6ded 100644
>> --- a/lib/hash/rte_cuckoo_hash.c
>> +++ b/lib/hash/rte_cuckoo_hash.c
>> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>>   	else
>>   #elif defined(RTE_ARCH_ARM64)
>> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
>> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
>> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
>> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
>> +	}
>>   	else
>>   #endif
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
>> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>>   #if defined(__ARM_NEON)
>>
>>   static inline void
>> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> -			const struct rte_hash_bucket *prim_bkt,
>> -			const struct rte_hash_bucket *sec_bkt,
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>>   			uint16_t sig,
>>   			enum rte_hash_sig_compare_function sig_cmp_fn)
>>   {
>>   	unsigned int i;
>>
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask buffer must be wide enough to hold the whole dense hitmask");
>> +
>>   	/* For match mask every bit indicates a match */
>>   	switch (sig_cmp_fn) {
> 
> Can I ask you to move the arch-specific comparison code into arch-specific headers or similar?
> It is getting really hard to read and understand the generic code with all these ifdefs and arch-specific instructions...
> 

I can easily enough move compare_signatures into an arm/x86
directory, and have a default version in the code.
The problem would be for bulk lookup. The function already exists in
two nearly identical versions (the l and lf ones). If I remove the
#ifdefs, I'll need to duplicate them again into 4 nearly identical
versions (dense and sparse). The only other option I see would be some
preprocessor macro to patch the function, but that looks even dirtier
to me.
I think duplicating the code would be bad, but I can do it if you
want. Unless you have a better solution?
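
For illustration, the split for the comparison code alone could look
something like this (file names hypothetical, not something I have
implemented yet):

	/* in rte_cuckoo_hash.c */
	#if defined(RTE_ARCH_ARM64)
	#include "compare_signatures_arm.h"	/* NEON + SVE, dense hitmask */
	#elif defined(RTE_ARCH_X86)
	#include "compare_signatures_x86.h"	/* SSE, sparse hitmask */
	#else
	#include "compare_signatures_generic.h"	/* scalar fallback */
	#endif

That would keep compare_signatures_dense()/compare_signatures_sparse()
out of the generic file, but it does nothing for the dense/sparse
#ifdefs inside the two bulk lookup functions, which is where the
duplication would come from.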

>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>>   	case RTE_HASH_COMPARE_NEON: {
>> -		uint16x8_t vmat, x;
>> +		uint16x8_t vmat, hit1, hit2;
>>   		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>>   		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>
>>   		/* Compare all signatures in the primary bucket */
>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
>> -		x = vandq_u16(vmat, mask);
>> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
>> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
>> +		hit1 = vandq_u16(vmat, mask);
>> +
>>   		/* Compare all signatures in the secondary bucket */
>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
>> -		x = vandq_u16(vmat, mask);
>> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
>> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
>> +		hit2 = vandq_u16(vmat, mask);
>> +
>> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
>> +		hit2 = vorrq_u16(hit1, hit2);
>> +		*hitmask_buffer = vaddvq_u16(hit2);
>> +		}
>> +		break;
>> +#endif
>> +#if defined(RTE_HAS_SVE_ACLE)
>> +	case RTE_HASH_COMPARE_SVE: {
>> +		svuint16_t vsign, shift, sv_matches;
>> +		svbool_t pred, match, bucket_wide_pred;
>> +		int i = 0;
>> +		uint64_t vl = svcnth();
>> +
>> +		vsign = svdup_u16(sig);
>> +		shift = svindex_u16(0, 1);
>> +
>> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
>> +			svuint16_t primary_array_vect, secondary_array_vect;
>> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
>> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
>> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
>> +
>> +			/* We merged the two vectors so we can do both comparisons at once */
>> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
>> +				primary_array_vect,
>> +				secondary_array_vect);
>> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
>> +
>> +			/* Compare all signatures in the buckets */
>> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
>> +			if (svptest_any(svptrue_b16(), match)) {
>> +				sv_matches = svdup_u16(1);
>> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
>> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
>> +			}
>> +		} else {
>> +			do {
>> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
>> +				uint16_t lower_half = 0;
>> +				uint16_t upper_half = 0;
>> +				/* Compare all signatures in the primary bucket */
>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>> +							&prim_bucket_sigs[i]));
>> +				if (svptest_any(svptrue_b16(), match)) {
>> +					sv_matches = svdup_u16(1);
>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
>> +				}
>> +				/* Compare all signatures in the secondary bucket */
>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>> +							&sec_bucket_sigs[i]));
>> +				if (svptest_any(svptrue_b16(), match)) {
>> +					sv_matches = svdup_u16(1);
>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
>> +						<< RTE_HASH_BUCKET_ENTRIES;
>> +				}
>> +				hitmask_buffer[i/8] = upper_half | lower_half;
>> +				i += vl;
>> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
>> +		}
>>   		}
>>   		break;
>> +#endif
>>   	default:
>>   		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> -			*prim_hash_matches |=
>> -				((sig == prim_bkt->sig_current[i]) << i);
>> -			*sec_hash_matches |=
>> -				((sig == sec_bkt->sig_current[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == prim_bucket_sigs[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>>   		}
>>   	}
>>   }
>> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
>>
>>   	/* For match mask the first bit of every two bits indicates the match */
>>   	switch (sig_cmp_fn) {
>> -#if defined(__SSE2__)
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>>   	case RTE_HASH_COMPARE_SSE:
>>   		/* Compare all signatures in the bucket */
>>   		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   	uint64_t hits = 0;
>>   	int32_t i;
>>   	int32_t ret;
>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>
>>   #if defined(__ARM_NEON)
>>   	const int hitmask_padding = 0;
>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +
>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>>   #else
>>   	const int hitmask_padding = 1;
>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   #endif
>>
>>   	__hash_rw_reader_lock(h);
>> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   	/* Compare signatures and prefetch key slot of first hit */
>>   	for (i = 0; i < num_keys; i++) {
>>   #if defined(__ARM_NEON)
>> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>> -			primary_bkt[i], secondary_bkt[i],
>> +		uint16_t *hitmask = &hitmask_buffer[i];
>> +		compare_signatures_dense(hitmask,
>> +			primary_bkt[i]->sig_current,
>> +			secondary_bkt[i]->sig_current,
>>   			sig[i], h->sig_cmp_fn);
>> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>   #else
>> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>   			primary_bkt[i], secondary_bkt[i],
>>   			sig[i], h->sig_cmp_fn);
>> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>   #endif
>>
>> -		if (prim_hitmask[i]) {
>> +		if (prim_hitmask) {
>>   			uint32_t first_hit =
>> -					rte_ctz32(prim_hitmask[i])
>> +					rte_ctz32(prim_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				primary_bkt[i]->key_idx[first_hit];
>> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   			continue;
>>   		}
>>
>> -		if (sec_hitmask[i]) {
>> +		if (sec_hitmask) {
>>   			uint32_t first_hit =
>> -					rte_ctz32(sec_hitmask[i])
>> +					rte_ctz32(sec_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				secondary_bkt[i]->key_idx[first_hit];
>> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   	/* Compare keys, first hits in primary first */
>>   	for (i = 0; i < num_keys; i++) {
>>   		positions[i] = -ENOENT;
>> -		while (prim_hitmask[i]) {
>> +#if defined(__ARM_NEON)
>> +		uint16_t *hitmask = &hitmask_buffer[i];
>> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>> +#else
>> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
>> +#endif
>> +		while (prim_hitmask) {
>>   			uint32_t hit_index =
>> -					rte_ctz32(prim_hitmask[i])
>> +					rte_ctz32(prim_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				primary_bkt[i]->key_idx[hit_index];
>> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   				positions[i] = key_idx - 1;
>>   				goto next_key;
>>   			}
>> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   		}
>>
>> -		while (sec_hitmask[i]) {
>> +		while (sec_hitmask) {
>>   			uint32_t hit_index =
>> -					rte_ctz32(sec_hitmask[i])
>> +					rte_ctz32(sec_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				secondary_bkt[i]->key_idx[hit_index];
>> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   				positions[i] = key_idx - 1;
>>   				goto next_key;
>>   			}
>> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   		}
>>   next_key:
>>   		continue;
>> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   	uint64_t hits = 0;
>>   	int32_t i;
>>   	int32_t ret;
>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>   	uint32_t cnt_b, cnt_a;
>>
>>   #if defined(__ARM_NEON)
>>   	const int hitmask_padding = 0;
>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>>   #else
>>   	const int hitmask_padding = 1;
>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   #endif
>>
>>   	for (i = 0; i < num_keys; i++)
>> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   		/* Compare signatures and prefetch key slot of first hit */
>>   		for (i = 0; i < num_keys; i++) {
>>   #if defined(__ARM_NEON)
>> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>> -				primary_bkt[i], secondary_bkt[i],
>> +			uint16_t *hitmask = &hitmask_buffer[i];
>> +			compare_signatures_dense(hitmask,
>> +				primary_bkt[i]->sig_current,
>> +				secondary_bkt[i]->sig_current,
>>   				sig[i], h->sig_cmp_fn);
>> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>   #else
>> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>   				primary_bkt[i], secondary_bkt[i],
>>   				sig[i], h->sig_cmp_fn);
>> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>   #endif
>>
>> -			if (prim_hitmask[i]) {
>> +			if (prim_hitmask) {
>>   				uint32_t first_hit =
>> -						rte_ctz32(prim_hitmask[i])
>> +						rte_ctz32(prim_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   					primary_bkt[i]->key_idx[first_hit];
>> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   				continue;
>>   			}
>>
>> -			if (sec_hitmask[i]) {
>> +			if (sec_hitmask) {
>>   				uint32_t first_hit =
>> -						rte_ctz32(sec_hitmask[i])
>> +						rte_ctz32(sec_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   					secondary_bkt[i]->key_idx[first_hit];
>> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>
>>   		/* Compare keys, first hits in primary first */
>>   		for (i = 0; i < num_keys; i++) {
>> -			while (prim_hitmask[i]) {
>> +#if defined(__ARM_NEON)
>> +			uint16_t *hitmask = &hitmask_buffer[i];
>> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>> +#else
>> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
>> +#endif
>> +			while (prim_hitmask) {
>>   				uint32_t hit_index =
>> -						rte_ctz32(prim_hitmask[i])
>> +						rte_ctz32(prim_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   				rte_atomic_load_explicit(
>> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   					positions[i] = key_idx - 1;
>>   					goto next_key;
>>   				}
>> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   			}
>>
>> -			while (sec_hitmask[i]) {
>> +			while (sec_hitmask) {
>>   				uint32_t hit_index =
>> -						rte_ctz32(sec_hitmask[i])
>> +						rte_ctz32(sec_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   				rte_atomic_load_explicit(
>> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   					positions[i] = key_idx - 1;
>>   					goto next_key;
>>   				}
>> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   			}
>>   next_key:
>>   			continue;
>> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
>> index 8ea793c66e..ed18e1f41e 100644
>> --- a/lib/hash/rte_cuckoo_hash.h
>> +++ b/lib/hash/rte_cuckoo_hash.h
>> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
>>   	RTE_HASH_COMPARE_SCALAR = 0,
>>   	RTE_HASH_COMPARE_SSE,
>>   	RTE_HASH_COMPARE_NEON,
>> +	RTE_HASH_COMPARE_SVE,
>>   	RTE_HASH_COMPARE_NUM
>>   };
>>
>> --
>> 2.34.1
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-28 14:48       ` Yoan Picchi
@ 2024-03-04 13:35         ` Konstantin Ananyev
  2024-03-05 15:36           ` Yoan Picchi
  0 siblings, 1 reply; 43+ messages in thread
From: Konstantin Ananyev @ 2024-03-04 13:35 UTC (permalink / raw)
  To: Yoan Picchi, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang



> >> - Implemented SVE code for comparing signatures in bulk lookup.
> >> - Added defines in code for SVE support.
> >> - Optimized the NEON code.
> >> - The new SVE code is ~5% slower than the optimized NEON on an N2 processor.
> >>
> >> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> >> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> >> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> >> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >> ---
> >>   lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
> >>   lib/hash/rte_cuckoo_hash.h |   1 +
> >>   2 files changed, 151 insertions(+), 46 deletions(-)
> >>
> >> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> >> index a07dd3a28d..231d6d6ded 100644
> >> --- a/lib/hash/rte_cuckoo_hash.c
> >> +++ b/lib/hash/rte_cuckoo_hash.c
> >> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
> >>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
> >>   	else
> >>   #elif defined(RTE_ARCH_ARM64)
> >> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
> >> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
> >>   		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> >> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> >> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> >> +	}
> >>   	else
> >>   #endif
> >>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
> >> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
> >>   #if defined(__ARM_NEON)
> >>
> >>   static inline void
> >> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> >> -			const struct rte_hash_bucket *prim_bkt,
> >> -			const struct rte_hash_bucket *sec_bkt,
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >>   			uint16_t sig,
> >>   			enum rte_hash_sig_compare_function sig_cmp_fn)
> >>   {
> >>   	unsigned int i;
> >>
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask buffer must be wide enough to hold the whole dense hitmask");
> >> +
> >>   	/* For match mask every bit indicates a match */
> >>   	switch (sig_cmp_fn) {
> >
> > Can I ask you to move the arch-specific comparison code into arch-specific headers or similar?
> > It is getting really hard to read and understand the generic code with all these ifdefs and arch-specific instructions...
> >

Hi, apologies for the long delay in response.

 
> I can easily enough move the compare_signatures into an arm/x86
> directory, and have a default version in the code.

Yes, that's what I had in mind.
 
> The problem would be for bulk lookup. The function already exists in
> two nearly identical versions (the l and lf ones). If I remove the
> #ifdefs, I'll need to duplicate them again into 4 nearly identical
> versions (dense and sparse). The only other option I see would be some
> preprocessor macro to patch the function, but that looks even dirtier to me.

Not sure I understood you here: from looking at the code I don't see any
arch-specific ifdefs in the bulk_lookup() routines.
What am I missing here?
 

> I think duplicating the code would be bad, but I can do it if you want.
> Unless you have a better solution?
> 
> >> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> >>   	case RTE_HASH_COMPARE_NEON: {
> >> -		uint16x8_t vmat, x;
> >> +		uint16x8_t vmat, hit1, hit2;
> >>   		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
> >>   		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
> >>
> >>   		/* Compare all signatures in the primary bucket */
> >> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
> >> -		x = vandq_u16(vmat, mask);
> >> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
> >> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
> >> +		hit1 = vandq_u16(vmat, mask);
> >> +
> >>   		/* Compare all signatures in the secondary bucket */
> >> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
> >> -		x = vandq_u16(vmat, mask);
> >> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
> >> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
> >> +		hit2 = vandq_u16(vmat, mask);
> >> +
> >> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
> >> +		hit2 = vorrq_u16(hit1, hit2);
> >> +		*hitmask_buffer = vaddvq_u16(hit2);
> >> +		}
> >> +		break;
> >> +#endif
> >> +#if defined(RTE_HAS_SVE_ACLE)
> >> +	case RTE_HASH_COMPARE_SVE: {
> >> +		svuint16_t vsign, shift, sv_matches;
> >> +		svbool_t pred, match, bucket_wide_pred;
> >> +		int i = 0;
> >> +		uint64_t vl = svcnth();
> >> +
> >> +		vsign = svdup_u16(sig);
> >> +		shift = svindex_u16(0, 1);
> >> +
> >> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
> >> +			svuint16_t primary_array_vect, secondary_array_vect;
> >> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
> >> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
> >> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
> >> +
> >> +			/* We merged the two vectors so we can do both comparisons at once */
> >> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
> >> +				primary_array_vect,
> >> +				secondary_array_vect);
> >> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
> >> +
> >> +			/* Compare all signatures in the buckets */
> >> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
> >> +			if (svptest_any(svptrue_b16(), match)) {
> >> +				sv_matches = svdup_u16(1);
> >> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
> >> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
> >> +			}
> >> +		} else {
> >> +			do {
> >> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
> >> +				uint16_t lower_half = 0;
> >> +				uint16_t upper_half = 0;
> >> +				/* Compare all signatures in the primary bucket */
> >> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> >> +							&prim_bucket_sigs[i]));
> >> +				if (svptest_any(svptrue_b16(), match)) {
> >> +					sv_matches = svdup_u16(1);
> >> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> >> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
> >> +				}
> >> +				/* Compare all signatures in the secondary bucket */
> >> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> >> +							&sec_bucket_sigs[i]));
> >> +				if (svptest_any(svptrue_b16(), match)) {
> >> +					sv_matches = svdup_u16(1);
> >> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> >> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
> >> +						<< RTE_HASH_BUCKET_ENTRIES;
> >> +				}
> >> +				hitmask_buffer[i/8] = upper_half | lower_half;
> >> +				i += vl;
> >> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
> >> +		}
> >>   		}
> >>   		break;
> >> +#endif
> >>   	default:
> >>   		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> -			*prim_hash_matches |=
> >> -				((sig == prim_bkt->sig_current[i]) << i);
> >> -			*sec_hash_matches |=
> >> -				((sig == sec_bkt->sig_current[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == prim_bucket_sigs[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >>   		}
> >>   	}
> >>   }
> >> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
> >>
> >>   	/* For match mask the first bit of every two bits indicates the match */
> >>   	switch (sig_cmp_fn) {
> >> -#if defined(__SSE2__)
> >> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> >>   	case RTE_HASH_COMPARE_SSE:
> >>   		/* Compare all signatures in the bucket */
> >>   		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   	uint64_t hits = 0;
> >>   	int32_t i;
> >>   	int32_t ret;
> >> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
> >>
> >>   #if defined(__ARM_NEON)
> >>   	const int hitmask_padding = 0;
> >> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
> >>   #else
> >>   	const int hitmask_padding = 1;
> >> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   #endif
> >>
> >>   	__hash_rw_reader_lock(h);
> >> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   	/* Compare signatures and prefetch key slot of first hit */
> >>   	for (i = 0; i < num_keys; i++) {
> >>   #if defined(__ARM_NEON)
> >> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> >> -			primary_bkt[i], secondary_bkt[i],
> >> +		uint16_t *hitmask = &hitmask_buffer[i];
> >> +		compare_signatures_dense(hitmask,
> >> +			primary_bkt[i]->sig_current,
> >> +			secondary_bkt[i]->sig_current,
> >>   			sig[i], h->sig_cmp_fn);
> >> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >>   #else
> >> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> >> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
> >>   			primary_bkt[i], secondary_bkt[i],
> >>   			sig[i], h->sig_cmp_fn);
> >> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >>   #endif
> >>
> >> -		if (prim_hitmask[i]) {
> >> +		if (prim_hitmask) {
> >>   			uint32_t first_hit =
> >> -					rte_ctz32(prim_hitmask[i])
> >> +					rte_ctz32(prim_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				primary_bkt[i]->key_idx[first_hit];
> >> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   			continue;
> >>   		}
> >>
> >> -		if (sec_hitmask[i]) {
> >> +		if (sec_hitmask) {
> >>   			uint32_t first_hit =
> >> -					rte_ctz32(sec_hitmask[i])
> >> +					rte_ctz32(sec_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				secondary_bkt[i]->key_idx[first_hit];
> >> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   	/* Compare keys, first hits in primary first */
> >>   	for (i = 0; i < num_keys; i++) {
> >>   		positions[i] = -ENOENT;
> >> -		while (prim_hitmask[i]) {
> >> +#if defined(__ARM_NEON)
> >> +		uint16_t *hitmask = &hitmask_buffer[i];
> >> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >> +#else
> >> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >> +#endif
> >> +		while (prim_hitmask) {
> >>   			uint32_t hit_index =
> >> -					rte_ctz32(prim_hitmask[i])
> >> +					rte_ctz32(prim_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				primary_bkt[i]->key_idx[hit_index];
> >> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   				positions[i] = key_idx - 1;
> >>   				goto next_key;
> >>   			}
> >> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   		}
> >>
> >> -		while (sec_hitmask[i]) {
> >> +		while (sec_hitmask) {
> >>   			uint32_t hit_index =
> >> -					rte_ctz32(sec_hitmask[i])
> >> +					rte_ctz32(sec_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				secondary_bkt[i]->key_idx[hit_index];
> >> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   				positions[i] = key_idx - 1;
> >>   				goto next_key;
> >>   			}
> >> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   		}
> >>   next_key:
> >>   		continue;
> >> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   	uint64_t hits = 0;
> >>   	int32_t i;
> >>   	int32_t ret;
> >> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
> >>   	uint32_t cnt_b, cnt_a;
> >>
> >>   #if defined(__ARM_NEON)
> >>   	const int hitmask_padding = 0;
> >> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
> >>   #else
> >>   	const int hitmask_padding = 1;
> >> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   #endif
> >>
> >>   	for (i = 0; i < num_keys; i++)
> >> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   		/* Compare signatures and prefetch key slot of first hit */
> >>   		for (i = 0; i < num_keys; i++) {
> >>   #if defined(__ARM_NEON)
> >> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> >> -				primary_bkt[i], secondary_bkt[i],
> >> +			uint16_t *hitmask = &hitmask_buffer[i];
> >> +			compare_signatures_dense(hitmask,
> >> +				primary_bkt[i]->sig_current,
> >> +				secondary_bkt[i]->sig_current,
> >>   				sig[i], h->sig_cmp_fn);
> >> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >>   #else
> >> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> >> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
> >>   				primary_bkt[i], secondary_bkt[i],
> >>   				sig[i], h->sig_cmp_fn);
> >> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >>   #endif
> >>
> >> -			if (prim_hitmask[i]) {
> >> +			if (prim_hitmask) {
> >>   				uint32_t first_hit =
> >> -						rte_ctz32(prim_hitmask[i])
> >> +						rte_ctz32(prim_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   					primary_bkt[i]->key_idx[first_hit];
> >> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   				continue;
> >>   			}
> >>
> >> -			if (sec_hitmask[i]) {
> >> +			if (sec_hitmask) {
> >>   				uint32_t first_hit =
> >> -						rte_ctz32(sec_hitmask[i])
> >> +						rte_ctz32(sec_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   					secondary_bkt[i]->key_idx[first_hit];
> >> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>
> >>   		/* Compare keys, first hits in primary first */
> >>   		for (i = 0; i < num_keys; i++) {
> >> -			while (prim_hitmask[i]) {
> >> +#if defined(__ARM_NEON)
> >> +			uint16_t *hitmask = &hitmask_buffer[i];
> >> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >> +#else
> >> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >> +#endif
> >> +			while (prim_hitmask) {
> >>   				uint32_t hit_index =
> >> -						rte_ctz32(prim_hitmask[i])
> >> +						rte_ctz32(prim_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   				rte_atomic_load_explicit(
> >> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   					positions[i] = key_idx - 1;
> >>   					goto next_key;
> >>   				}
> >> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   			}
> >>
> >> -			while (sec_hitmask[i]) {
> >> +			while (sec_hitmask) {
> >>   				uint32_t hit_index =
> >> -						rte_ctz32(sec_hitmask[i])
> >> +						rte_ctz32(sec_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   				rte_atomic_load_explicit(
> >> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   					positions[i] = key_idx - 1;
> >>   					goto next_key;
> >>   				}
> >> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   			}
> >>   next_key:
> >>   			continue;
> >> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
> >> index 8ea793c66e..ed18e1f41e 100644
> >> --- a/lib/hash/rte_cuckoo_hash.h
> >> +++ b/lib/hash/rte_cuckoo_hash.h
> >> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
> >>   	RTE_HASH_COMPARE_SCALAR = 0,
> >>   	RTE_HASH_COMPARE_SSE,
> >>   	RTE_HASH_COMPARE_NEON,
> >> +	RTE_HASH_COMPARE_SVE,
> >>   	RTE_HASH_COMPARE_NUM
> >>   };
> >>
> >> --
> >> 2.34.1
> >


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-03-04 13:35         ` Konstantin Ananyev
@ 2024-03-05 15:36           ` Yoan Picchi
  0 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-05 15:36 UTC (permalink / raw)
  To: Konstantin Ananyev, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang

On 3/4/24 13:35, Konstantin Ananyev wrote:
> 
> 
>>>> - Implemented SVE code for comparing signatures in bulk lookup.
>>>> - Added Defines in code for SVE code support.
>>>> - Optimise NEON code
>>>> - New SVE code is ~5% slower than optimized NEON for N2 processor.
>>>>
>>>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>>>> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
>>>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>>>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>>>> ---
>>>>    lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
>>>>    lib/hash/rte_cuckoo_hash.h |   1 +
>>>>    2 files changed, 151 insertions(+), 46 deletions(-)
>>>>
>>>> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
>>>> index a07dd3a28d..231d6d6ded 100644
>>>> --- a/lib/hash/rte_cuckoo_hash.c
>>>> +++ b/lib/hash/rte_cuckoo_hash.c
>>>> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
>>>>    		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>>>>    	else
>>>>    #elif defined(RTE_ARCH_ARM64)
>>>> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
>>>> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>>>>    		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
>>>> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
>>>> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
>>>> +	}
>>>>    	else
>>>>    #endif
>>>>    		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
>>>> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>>>>    #if defined(__ARM_NEON)
>>>>
>>>>    static inline void
>>>> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>>>> -			const struct rte_hash_bucket *prim_bkt,
>>>> -			const struct rte_hash_bucket *sec_bkt,
>>>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>>>> +			const uint16_t *prim_bucket_sigs,
>>>> +			const uint16_t *sec_bucket_sigs,
>>>>    			uint16_t sig,
>>>>    			enum rte_hash_sig_compare_function sig_cmp_fn)
>>>>    {
>>>>    	unsigned int i;
>>>>
>>>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>>>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>>>> +
>>>>    	/* For match mask every bit indicates the match */
>>>>    	switch (sig_cmp_fn) {
>>>
>>> Can I ask to move arch specific comparison code into some arch-specific headers or so?
>>> It is getting really hard to read and understand the generic code with all these ifdefs and arch specific instructions...
>>>
> 
> Hi, apologies for long delay in response.
> 
>   
>> I can easily enough move the compare_signatures into an arm/x86
>> directory, and have a default version in the code.
> 
> Yes, that's what I thought about.
>   
>> The problem would be for bulk lookup. The function is already duplicated
>>    2 times (the l and lf version). If I remove the #ifdefs, I'll need to
>> duplicate them again into 4 nearly identical versions (dense and
>> sparse). The only third option I see would be some preprocessor macro
>> to patch the function, but that looks even dirtier to me.
> 
> Not sure I understood you here: from looking at the code I don't see any
> arch specific ifdefs in bulk_lookup() routines.
> What am I missing here?
>   

Most if not all of those #ifs are architecture-specific. For instance:
#if defined(__ARM_NEON)
#if defined(RTE_HAS_SVE_ACLE)

The main reason there are #ifs in bulk lookup is to handle whether the
function runs with a dense hitmask or a sparse hitmask.
x86 only supports the sparse hitmask version (1 bit of data, 1 bit of
padding) but Arm supports the dense hitmask (every bit counts). The
latter ends up being faster.
Splitting bulk_lookup into its sparse and dense variants would be a lot
of code duplication that I'd prefer to avoid.
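
To illustrate the two layouts, here is a small standalone example (not
from the patch; __builtin_ctz stands in for rte_ctz32 and the hit
positions are made up) showing how a first-hit index is recovered from
each layout:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Sparse layout (x86): 1 bit of data then 1 bit of padding per
	 * entry, so a hit on bucket entry 2 sets bit 4.
	 */
	uint32_t sparse = 1u << (2 << 1);
	/* Dense layout (Arm): 1 bit per entry, primary matches in the
	 * low 8 bits, secondary matches in the high 8 bits.
	 * Here: hits on primary entry 2 and secondary entry 5.
	 */
	uint16_t dense = (1u << 2) | (1u << (5 + 8));

	/* hitmask_padding is 1 for the sparse layout, 0 for the dense */
	printf("sparse first hit: %d\n", __builtin_ctz(sparse) >> 1);
	printf("dense primary first hit: %d\n",
		__builtin_ctz(dense & 0xff) >> 0);
	return 0;
}

Both print 2: the padding bit only changes how far the count of
trailing zeros has to be shifted back.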

What I might be able to do is move compare_signatures into
arch-specific versions. The functions are different enough that it
wouldn't be too much of a code duplication. I'd argue though that the
#ifdefs for NEON and SSE were already there and I only added the SVE
variant.
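
To make that concrete, the dispatch could look something like this in
rte_cuckoo_hash.c (just a sketch; the exact file layout is still open):

#if defined(__ARM_NEON)
#include "arch/arm/compare_signatures.h"
#elif defined(__SSE2__)
#include "arch/x86/compare_signatures.h"
#else
#include "arch/common/compare_signatures.h"
#endif

Each arch header would also define a DENSE_HASH_BULK_LOOKUP flag, so
that bulk lookup needs a single #if on the hitmask layout rather than
one per architecture.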

> 
>> I think duplicating the code would be bad, but I can do it if you want.
>> Unless you have a better solution?
>>
>>>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>>>>    	case RTE_HASH_COMPARE_NEON: {
>>>> -		uint16x8_t vmat, x;
>>>> +		uint16x8_t vmat, hit1, hit2;
>>>>    		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>>>>    		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>>>
>>>>    		/* Compare all signatures in the primary bucket */
>>>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
>>>> -		x = vandq_u16(vmat, mask);
>>>> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
>>>> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
>>>> +		hit1 = vandq_u16(vmat, mask);
>>>> +
>>>>    		/* Compare all signatures in the secondary bucket */
>>>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
>>>> -		x = vandq_u16(vmat, mask);
>>>> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
>>>> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
>>>> +		hit2 = vandq_u16(vmat, mask);
>>>> +
>>>> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
>>>> +		hit2 = vorrq_u16(hit1, hit2);
>>>> +		*hitmask_buffer = vaddvq_u16(hit2);
>>>> +		}
>>>> +		break;
>>>> +#endif
>>>> +#if defined(RTE_HAS_SVE_ACLE)
>>>> +	case RTE_HASH_COMPARE_SVE: {
>>>> +		svuint16_t vsign, shift, sv_matches;
>>>> +		svbool_t pred, match, bucket_wide_pred;
>>>> +		int i = 0;
>>>> +		uint64_t vl = svcnth();
>>>> +
>>>> +		vsign = svdup_u16(sig);
>>>> +		shift = svindex_u16(0, 1);
>>>> +
>>>> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
>>>> +			svuint16_t primary_array_vect, secondary_array_vect;
>>>> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
>>>> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
>>>> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
>>>> +
>>>> +			/* We merged the two vectors so we can do both comparisons at once */
>>>> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
>>>> +				primary_array_vect,
>>>> +				secondary_array_vect);
>>>> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
>>>> +
>>>> +			/* Compare all signatures in the buckets */
>>>> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
>>>> +			if (svptest_any(svptrue_b16(), match)) {
>>>> +				sv_matches = svdup_u16(1);
>>>> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
>>>> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
>>>> +			}
>>>> +		} else {
>>>> +			do {
>>>> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
>>>> +				uint16_t lower_half = 0;
>>>> +				uint16_t upper_half = 0;
>>>> +				/* Compare all signatures in the primary bucket */
>>>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>>>> +							&prim_bucket_sigs[i]));
>>>> +				if (svptest_any(svptrue_b16(), match)) {
>>>> +					sv_matches = svdup_u16(1);
>>>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>>>> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
>>>> +				}
>>>> +				/* Compare all signatures in the secondary bucket */
>>>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>>>> +							&sec_bucket_sigs[i]));
>>>> +				if (svptest_any(svptrue_b16(), match)) {
>>>> +					sv_matches = svdup_u16(1);
>>>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>>>> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
>>>> +						<< RTE_HASH_BUCKET_ENTRIES;
>>>> +				}
>>>> +				hitmask_buffer[i/8] = upper_half | lower_half;
>>>> +				i += vl;
>>>> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
>>>> +		}
>>>>    		}
>>>>    		break;
>>>> +#endif
>>>>    	default:
>>>>    		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>>>> -			*prim_hash_matches |=
>>>> -				((sig == prim_bkt->sig_current[i]) << i);
>>>> -			*sec_hash_matches |=
>>>> -				((sig == sec_bkt->sig_current[i]) << i);
>>>> +			*hitmask_buffer |=
>>>> +				((sig == prim_bucket_sigs[i]) << i);
>>>> +			*hitmask_buffer |=
>>>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>>>>    		}
>>>>    	}
>>>>    }
>>>> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
>>>>
>>>>    	/* For match mask the first bit of every two bits indicates the match */
>>>>    	switch (sig_cmp_fn) {
>>>> -#if defined(__SSE2__)
>>>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>>>>    	case RTE_HASH_COMPARE_SSE:
>>>>    		/* Compare all signatures in the bucket */
>>>>    		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>>>> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    	uint64_t hits = 0;
>>>>    	int32_t i;
>>>>    	int32_t ret;
>>>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>>>
>>>>    #if defined(__ARM_NEON)
>>>>    	const int hitmask_padding = 0;
>>>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +
>>>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>>>> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>>>>    #else
>>>>    	const int hitmask_padding = 1;
>>>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    #endif
>>>>
>>>>    	__hash_rw_reader_lock(h);
>>>> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    	/* Compare signatures and prefetch key slot of first hit */
>>>>    	for (i = 0; i < num_keys; i++) {
>>>>    #if defined(__ARM_NEON)
>>>> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>>>> -			primary_bkt[i], secondary_bkt[i],
>>>> +		uint16_t *hitmask = &hitmask_buffer[i];
>>>> +		compare_signatures_dense(hitmask,
>>>> +			primary_bkt[i]->sig_current,
>>>> +			secondary_bkt[i]->sig_current,
>>>>    			sig[i], h->sig_cmp_fn);
>>>> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>>    #else
>>>> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>>>> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>>>    			primary_bkt[i], secondary_bkt[i],
>>>>    			sig[i], h->sig_cmp_fn);
>>>> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>>    #endif
>>>>
>>>> -		if (prim_hitmask[i]) {
>>>> +		if (prim_hitmask) {
>>>>    			uint32_t first_hit =
>>>> -					rte_ctz32(prim_hitmask[i])
>>>> +					rte_ctz32(prim_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				primary_bkt[i]->key_idx[first_hit];
>>>> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    			continue;
>>>>    		}
>>>>
>>>> -		if (sec_hitmask[i]) {
>>>> +		if (sec_hitmask) {
>>>>    			uint32_t first_hit =
>>>> -					rte_ctz32(sec_hitmask[i])
>>>> +					rte_ctz32(sec_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				secondary_bkt[i]->key_idx[first_hit];
>>>> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    	/* Compare keys, first hits in primary first */
>>>>    	for (i = 0; i < num_keys; i++) {
>>>>    		positions[i] = -ENOENT;
>>>> -		while (prim_hitmask[i]) {
>>>> +#if defined(__ARM_NEON)
>>>> +		uint16_t *hitmask = &hitmask_buffer[i];
>>>> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>> +#else
>>>> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>> +#endif
>>>> +		while (prim_hitmask) {
>>>>    			uint32_t hit_index =
>>>> -					rte_ctz32(prim_hitmask[i])
>>>> +					rte_ctz32(prim_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				primary_bkt[i]->key_idx[hit_index];
>>>> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    				positions[i] = key_idx - 1;
>>>>    				goto next_key;
>>>>    			}
>>>> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    		}
>>>>
>>>> -		while (sec_hitmask[i]) {
>>>> +		while (sec_hitmask) {
>>>>    			uint32_t hit_index =
>>>> -					rte_ctz32(sec_hitmask[i])
>>>> +					rte_ctz32(sec_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				secondary_bkt[i]->key_idx[hit_index];
>>>> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    				positions[i] = key_idx - 1;
>>>>    				goto next_key;
>>>>    			}
>>>> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    		}
>>>>    next_key:
>>>>    		continue;
>>>> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    	uint64_t hits = 0;
>>>>    	int32_t i;
>>>>    	int32_t ret;
>>>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>>>    	uint32_t cnt_b, cnt_a;
>>>>
>>>>    #if defined(__ARM_NEON)
>>>>    	const int hitmask_padding = 0;
>>>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>>>> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>>>>    #else
>>>>    	const int hitmask_padding = 1;
>>>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    #endif
>>>>
>>>>    	for (i = 0; i < num_keys; i++)
>>>> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    		/* Compare signatures and prefetch key slot of first hit */
>>>>    		for (i = 0; i < num_keys; i++) {
>>>>    #if defined(__ARM_NEON)
>>>> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>>>> -				primary_bkt[i], secondary_bkt[i],
>>>> +			uint16_t *hitmask = &hitmask_buffer[i];
>>>> +			compare_signatures_dense(hitmask,
>>>> +				primary_bkt[i]->sig_current,
>>>> +				secondary_bkt[i]->sig_current,
>>>>    				sig[i], h->sig_cmp_fn);
>>>> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>>    #else
>>>> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>>>> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>>>    				primary_bkt[i], secondary_bkt[i],
>>>>    				sig[i], h->sig_cmp_fn);
>>>> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>>    #endif
>>>>
>>>> -			if (prim_hitmask[i]) {
>>>> +			if (prim_hitmask) {
>>>>    				uint32_t first_hit =
>>>> -						rte_ctz32(prim_hitmask[i])
>>>> +						rte_ctz32(prim_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    					primary_bkt[i]->key_idx[first_hit];
>>>> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    				continue;
>>>>    			}
>>>>
>>>> -			if (sec_hitmask[i]) {
>>>> +			if (sec_hitmask) {
>>>>    				uint32_t first_hit =
>>>> -						rte_ctz32(sec_hitmask[i])
>>>> +						rte_ctz32(sec_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    					secondary_bkt[i]->key_idx[first_hit];
>>>> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>
>>>>    		/* Compare keys, first hits in primary first */
>>>>    		for (i = 0; i < num_keys; i++) {
>>>> -			while (prim_hitmask[i]) {
>>>> +#if defined(__ARM_NEON)
>>>> +			uint16_t *hitmask = &hitmask_buffer[i];
>>>> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>> +#else
>>>> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>> +#endif
>>>> +			while (prim_hitmask) {
>>>>    				uint32_t hit_index =
>>>> -						rte_ctz32(prim_hitmask[i])
>>>> +						rte_ctz32(prim_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    				rte_atomic_load_explicit(
>>>> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    					positions[i] = key_idx - 1;
>>>>    					goto next_key;
>>>>    				}
>>>> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    			}
>>>>
>>>> -			while (sec_hitmask[i]) {
>>>> +			while (sec_hitmask) {
>>>>    				uint32_t hit_index =
>>>> -						rte_ctz32(sec_hitmask[i])
>>>> +						rte_ctz32(sec_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    				rte_atomic_load_explicit(
>>>> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    					positions[i] = key_idx - 1;
>>>>    					goto next_key;
>>>>    				}
>>>> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    			}
>>>>    next_key:
>>>>    			continue;
>>>> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
>>>> index 8ea793c66e..ed18e1f41e 100644
>>>> --- a/lib/hash/rte_cuckoo_hash.h
>>>> +++ b/lib/hash/rte_cuckoo_hash.h
>>>> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
>>>>    	RTE_HASH_COMPARE_SCALAR = 0,
>>>>    	RTE_HASH_COMPARE_SSE,
>>>>    	RTE_HASH_COMPARE_NEON,
>>>> +	RTE_HASH_COMPARE_SVE,
>>>>    	RTE_HASH_COMPARE_NUM
>>>>    };
>>>>
>>>> --
>>>> 2.34.1
>>>
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v6 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (4 preceding siblings ...)
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
@ 2024-03-11 23:21 ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
                   ` (2 subsequent siblings)
  8 siblings, 4 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple of ints to uint16_t in compare_signatures_dense
  Several coding-style fixes

V3->V4:
  Rebase

V4->V5:
  Commit message

V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 197 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 392 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interleaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 195 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 6 files changed, 258 insertions(+), 92 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..1af6ba8190
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bit indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..dcf9444032
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bit indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..e41f03270a 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -442,8 +450,9 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1857,63 +1866,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1876,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1924,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1941,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1974,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2003,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2053,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2080,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2110,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2126,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2163,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2196,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v6 2/4] hash: optimize compare signature for NEON
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip the shifting by simply masking with specific masks.
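
For illustration, here is a standalone sketch (not part of the patch;
it needs an AArch64 compiler) with a signature that matches lanes 1
and 3:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t sigs[8] = {7, 42, 9, 42, 0, 0, 0, 0};
	uint16_t sig = 42;
	const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
	/* matching lanes become all-ones (0xffff), the others 0 */
	uint16x8_t vmat = vceqq_u16(vld1q_dup_u16(&sig), vld1q_u16(sigs));
	/* the mask keeps exactly the entry's bit in each matching lane:
	 * {0, 0x2, 0, 0x8, 0, 0, 0, 0}, with no per-lane shift needed
	 */
	uint16x8_t hits = vandq_u16(vmat, mask);
	/* the horizontal add assembles the hitmask: 0x2 + 0x8 = 0xa */
	printf("hitmask = 0x%x\n", (unsigned int)vaddvq_u16(hits));
	return 0;
}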

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 1af6ba8190..b5a457f936 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown

This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/* Bulk lookup after add with same hash */
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk lookup on empty table */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v6 4/4] hash: add SVE support for bulk key lookup
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-03-11 23:21   ` [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-12  3:57     ` fengchengwen
  3 siblings, 1 reply; 43+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang

- Implemented SVE code for comparing signatures in bulk lookup.
- Added Defines in code for SVE code support.
- Optimise NEON code
- New SVE code is ~5% slower than optimized NEON for N2 processor.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  2 +
 2 files changed, 60 insertions(+)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index b5a457f936..8a0627e119 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index e41f03270a..7a474267f0 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -452,6 +452,8 @@ rte_hash_create(const struct rte_hash_parameters *params)
 #elif defined(RTE_ARCH_ARM64)
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
 	}
 	else
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 4/4] hash: add SVE support for bulk key lookup
  2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-03-12  3:57     ` fengchengwen
  2024-03-12 15:08       ` Yoan Picchi
  0 siblings, 1 reply; 43+ messages in thread
From: fengchengwen @ 2024-03-12  3:57 UTC (permalink / raw)
  To: Yoan Picchi, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang

Hi Yoan,

On 2024/3/12 7:21, Yoan Picchi wrote:
> - Implemented SVE code for comparing signatures in bulk lookup.
> - Added Defines in code for SVE code support.
> - Optimise NEON code

This commit does not include this part. Please describe only the content of this commit.

> - New SVE code is ~5% slower than optimized NEON for N2 processor.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
>  lib/hash/rte_cuckoo_hash.c             |  2 +
>  2 files changed, 60 insertions(+)
> 
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> index b5a457f936..8a0627e119 100644
> --- a/lib/hash/arch/arm/compare_signatures.h
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>  		*hitmask_buffer = vaddvq_u16(hit2);
>  		}
>  		break;
> +#endif
> +#if defined(RTE_HAS_SVE_ACLE)
> +	case RTE_HASH_COMPARE_SVE: {

...

>  #endif
>  	default:
>  		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> index e41f03270a..7a474267f0 100644
> --- a/lib/hash/rte_cuckoo_hash.c
> +++ b/lib/hash/rte_cuckoo_hash.c
> @@ -452,6 +452,8 @@ rte_hash_create(const struct rte_hash_parameters *params)
>  #elif defined(RTE_ARCH_ARM64)
>  	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;

The RTE_HASH_COMPARE_SVE was defined in "[PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup",
but its first use is in this commit, so I think it should be defined in this commit.

If RTE_CPUFLAG_SVE and RTE_HAS_SVE_ACLE are both set, the SVE implementation will be chosen.
If RTE_CPUFLAG_SVE is defined but RTE_HAS_SVE_ACLE is not, the scalar path will be chosen --- in that case we could fall back to the NEON implementation instead.
So I suggest using "#if defined(RTE_HAS_SVE_ACLE)" directly here.
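
Concretely, the suggested guard would look something like this (just a
sketch of the shape, not a final patch):

#elif defined(RTE_ARCH_ARM64)
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
#if defined(RTE_HAS_SVE_ACLE)
		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
#endif
	}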

>  	}
>  	else
>  #endif
> 

Plus:
I notice the commit log says the SVE performance is slower than NEON.

I also notice that on other platforms SVE is slower than NEON:
1. b4ee9c07bd config/arm: disable SVE ACLE for CN10K
2. 4eea7c6461 config/arm: add SVE ACLE control flag

So maybe we should disable RTE_HAS_SVE_ACLE by default:
diff --git a/config/arm/meson.build b/config/arm/meson.build
index 9d6fb87d7f..a5b890d100 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -875,7 +875,7 @@ endif

 if cc.get_define('__ARM_FEATURE_SVE', args: machine_args) != ''
     compile_time_cpuflags += ['RTE_CPUFLAG_SVE']
-    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', true))
+    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', false))
         dpdk_conf.set('RTE_HAS_SVE_ACLE', 1)
     endif
 endif

If a platform verifies that SVE has higher performance, it could then enable SVE by adding "sve_acle: true" to its soc_xxx config.

Thanks

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 4/4] hash: add SVE support for bulk key lookup
  2024-03-12  3:57     ` fengchengwen
@ 2024-03-12 15:08       ` Yoan Picchi
  0 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:08 UTC (permalink / raw)
  To: fengchengwen, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang

On 3/12/24 03:57, fengchengwen wrote:
> Hi Yoan,
> 
> On 2024/3/12 7:21, Yoan Picchi wrote:
>> - Implemented SVE code for comparing signatures in bulk lookup.
>> - Added Defines in code for SVE code support.
>> - Optimise NEON code
> 
> This commit does not include this part. Please describe only the content of this commit.

Thank you. I forgot to edit that out after moving commits around.

> 
>> - New SVE code is ~5% slower than optimized NEON for N2 processor.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>>   lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
>>   lib/hash/rte_cuckoo_hash.c             |  2 +
>>   2 files changed, 60 insertions(+)
>>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> index b5a457f936..8a0627e119 100644
>> --- a/lib/hash/arch/arm/compare_signatures.h
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>>   		*hitmask_buffer = vaddvq_u16(hit2);
>>   		}
>>   		break;
>> +#endif
>> +#if defined(RTE_HAS_SVE_ACLE)
>> +	case RTE_HASH_COMPARE_SVE: {
> 
> ...
> 
>>   #endif
>>   	default:
>>   		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
>> index e41f03270a..7a474267f0 100644
>> --- a/lib/hash/rte_cuckoo_hash.c
>> +++ b/lib/hash/rte_cuckoo_hash.c
>> @@ -452,6 +452,8 @@ rte_hash_create(const struct rte_hash_parameters *params)
>>   #elif defined(RTE_ARCH_ARM64)
>>   	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
>> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
>> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> 
> The RTE_HASH_COMPARE_SVE was defined in "[PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup",
> but its first use is in this commit, so I think it should be defined in this commit.
> 
> If RTE_CPUFLAG_SVE and RTE_HAS_SVE_ACLE are both set, the SVE implementation will be chosen.
> If RTE_CPUFLAG_SVE is defined but RTE_HAS_SVE_ACLE is not, the scalar path will be chosen --- in that case we could fall back to the NEON implementation instead.
> So I suggest using "#if defined(RTE_HAS_SVE_ACLE)" directly here.

Sounds fair. I'll do it.

> 
>>   	}
>>   	else
>>   #endif
>>
> 
> Plus:
> I notice the commit log said the SVE performance is slower than NEON.
> 
> And I also notice other platform SVE also lower than NEON,
> 1. b4ee9c07bd config/arm: disable SVE ACLE for CN10K
> 2. 4eea7c6461 config/arm: add SVE ACLE control flag
> 
> So maybe we should disable RTE_HAS_SVE_ACLE default by:
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 9d6fb87d7f..a5b890d100 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -875,7 +875,7 @@ endif
> 
>   if cc.get_define('__ARM_FEATURE_SVE', args: machine_args) != ''
>       compile_time_cpuflags += ['RTE_CPUFLAG_SVE']
> -    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', true))
> +    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', false))
>           dpdk_conf.set('RTE_HAS_SVE_ACLE', 1)
>       endif
>   endif
> 
> If a platform verifies that SVE has higher performance, it could then enable SVE by adding "sve_acle: true" to its soc_xxx config.
> 
> Thanks

Here I somewhat disagree. In this particular instance, SVE is a bit 
slower with narrow (128b) vectors, but it could be faster with some 
wider vector sizes.
Even in general, 128b SVE is not simply slower than NEON; it is a 
case-by-case matter. Sometimes it's slower, sometimes it's faster, so I 
don't think we should just disable it by default. In any case, disabling 
it should be its own patch with proper discussion, not an offhand change 
included in the middle of this patch.
This SVE version is still faster than the upstream NEON version; I just 
happen to have improved the NEON version even more.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v7 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (5 preceding siblings ...)
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
@ 2024-03-12 15:42 ` Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
  8 siblings, 4 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
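
As a rough illustration of the two hitmask layouts (a sketch assuming
the default RTE_HASH_BUCKET_ENTRIES of 8, as used by the patches below):

	/*
	 * Sparse hitmask (x86 path): one 32-bit word per bucket, per key.
	 *   bit 2*i   -> entry i matched
	 *   bit 2*i+1 -> padding, always 0 (masked off with 0x5555)
	 *
	 * Dense hitmask: one 16-bit word per key, no padding.
	 *   bits [7:0]  -> primary bucket, entries 0..7
	 *   bits [15:8] -> secondary bucket, entries 0..7
	 */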

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple of ints to uint16_t in compare_signatures_dense
  Several coding-style fixes

V3->V4:
  Rebase

V4->V5:
  Commit message

V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE

V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 394 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  2024-03-19 10:41     ` Konstantin Ananyev
  2024-03-19 16:09     ` Stephen Hemminger
  2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
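
For example, with hits in entries 1 and 3 of a bucket, both layouts are
consumed the same way (a worked sketch, not code from the patch):

	/* sparse: hits in entries 1 and 3 -> bits 2 and 6 -> 0x44 */
	uint32_t sparse = 0x44;
	uint32_t first  = rte_ctz32(sparse) >> 1;   /* -> entry 1 */
	sparse &= ~(3ULL << (first << 1));          /* clear the bit pair */

	/* dense: hits in entries 1 and 3 -> bits 1 and 3 -> 0x0a */
	uint16_t dense = 0x0a;
	uint32_t hit   = rte_ctz32(dense);          /* -> entry 1, no shift */
	dense &= ~(1 << hit);                       /* clear a single bit */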

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 255 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..1af6ba8190
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask, every bit indicates a match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..dcf9444032
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask, every bit indicates a match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask, the first bit of every two bits indicates a match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v7 2/4] hash: optimize compare signature for NEON
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  2024-03-20  7:37     ` [EXTERNAL] " Pavan Nikhilesh Bhagavatula
  2024-03-12 15:42   ` [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 1 reply; 43+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can therefore skip the per-lane shifts and simply mask each lane
with its own bit, as sketched below.
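
The idea in miniature, with the same intrinsics the patch uses (vsig
holds the signature duplicated across lanes, sigs stands for a bucket's
signature array):

	uint16x8_t vmat = vceqq_u16(vsig, vld1q_u16(sigs)); /* 0xffff per hit */
	const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
	uint16x8_t hits = vandq_u16(vmat, mask); /* keep only bit i in lane i */
	uint16_t bitmap = vaddvq_u16(hits);      /* lanes hold disjoint bits,
						  * so the add acts as an OR */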

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 1af6ba8190..b5a457f936 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown

This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to match the current number
of entries in a hash bucket.
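
For reference, the API shape the new test exercises (a sketch; the
names follow the test below):

	const void *key_array[KEY_PER_BUCKET + 1];
	int32_t positions[KEY_PER_BUCKET + 1];
	/* point key_array[i] at each key, then: */
	int ret = rte_hash_lookup_bulk(handle, key_array,
				       KEY_PER_BUCKET + 1, positions);
	/* ret is 0 on success; positions[i] then holds the key's slot,
	 * or -ENOENT when key i is not in the table */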

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/* Bulk lookup after add with same hash */
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk lookup on empty table */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v7 4/4] hash: add SVE support for bulk key lookup
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-03-12 15:42   ` [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang

- Implemented SVE code for comparing signatures in bulk lookup (a
sketch follows below).
- The new SVE code is ~5% slower than optimized NEON on the N2
processor for 128b vectors.
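
A sketch of the wide-vector fast path, using the intrinsics from the
patch below (prim_sigs and sec_sigs stand for the two 8-entry bucket
signature arrays): when the vector length covers both arrays
(vl >= 16 halfwords), they are loaded under an 8-lane predicate,
spliced back to back, and compared in a single sweep; the match
predicate is then turned into a bitmask:

	svbool_t   p8    = svwhilelt_b16(0, 8);          /* 8 active lanes */
	svuint16_t prim  = svld1_u16(p8, prim_sigs);
	svuint16_t sec   = svld1_u16(p8, sec_sigs);
	svuint16_t both  = svsplice_u16(p8, prim, sec);  /* prim:0-7, sec:8-15 */
	svbool_t   p16   = svwhilelt_b16(0, 16);
	svbool_t   match = svcmpeq_u16(p16, svdup_u16(sig), both);
	/* predicate -> bitmask: put 1 << lane in each matching lane,
	 * then OR-reduce across the vector */
	svuint16_t bits  = svlsl_u16_z(match, svdup_u16(1), svindex_u16(0, 1));
	uint16_t   mask  = svorv_u16(svptrue_b16(), bits);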

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  7 +++-
 lib/hash/rte_cuckoo_hash.h             |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index b5a457f936..8a0627e119 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0697743cdf..75f555ba2c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-03-19 10:41     ` Konstantin Ananyev
  2024-03-19 13:09       ` Yoan Picchi
  2024-03-19 16:09     ` Stephen Hemminger
  1 sibling, 1 reply; 43+ messages in thread
From: Konstantin Ananyev @ 2024-03-19 10:41 UTC (permalink / raw)
  To: Yoan Picchi, Thomas Monjalon, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown


Hi,

> Current hitmask includes padding due to Intel's SIMD
> implementation detail. This patch allows non Intel SIMD
> implementations to benefit from a dense hitmask.
> In addition, the new dense hitmask interweaves the primary
> and secondary matches, which allows better cache usage and
> enables future improvements for the SIMD implementations.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> ---
>  .mailmap                                  |   2 +
>  lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>  lib/hash/arch/common/compare_signatures.h |  38 +++++
>  lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>  lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>  5 files changed, 255 insertions(+), 91 deletions(-)
>  create mode 100644 lib/hash/arch/arm/compare_signatures.h
>  create mode 100644 lib/hash/arch/common/compare_signatures.h
>  create mode 100644 lib/hash/arch/x86/compare_signatures.h
> 
> diff --git a/.mailmap b/.mailmap
> index 66ebc20666..00b50414d3 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>  Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>  Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>  Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> +Harjot Singh <harjot.singh@arm.com>
>  Harman Kalra <hkalra@marvell.com>
>  Harneet Singh <harneet.singh@intel.com>
>  Harold Huang <baymaxhuang@gmail.com>
> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>  Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>  Yi Zhang <zhang.yi75@zte.com.cn>
>  Yoann Desmouceaux <ydesmouc@cisco.com>
> +Yoan Picchi <yoan.picchi@arm.com>
>  Yogesh Jangra <yogesh.jangra@intel.com>
>  Yogev Chaimovich <yogev@cgstowernetworks.com>
>  Yongjie Gu <yongjiex.gu@intel.com>
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> new file mode 100644
> index 0000000000..1af6ba8190
> --- /dev/null
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -0,0 +1,61 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * Arm's version uses a densely packed hitmask buffer:
> + * Every bit is in use.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
> +	/* For match mask, every bit indicates a match */
> +	switch (sig_cmp_fn) {
> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> +	case RTE_HASH_COMPARE_NEON: {
> +		uint16x8_t vmat, vsig, x;
> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> +		uint16_t low, high;
> +
> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
> +		/* Compare all signatures in the primary bucket */
> +		vmat = vceqq_u16(vsig,
> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +		low = (uint16_t)(vaddvq_u16(x));
> +		/* Compare all signatures in the secondary bucket */
> +		vmat = vceqq_u16(vsig,
> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +		high = (uint16_t)(vaddvq_u16(x));
> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +
> +		}
> +		break;
> +#endif
> +	default:
> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			*hitmask_buffer |=
> +				((sig == prim_bucket_sigs[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +		}
> +	}
> +}
> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> new file mode 100644
> index 0000000000..dcf9444032
> --- /dev/null
> +++ b/lib/hash/arch/common/compare_signatures.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> + * but the dense one is slightly faster.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	(void) sig_cmp_fn;
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
> +	/* For match mask, every bit indicates a match */
> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +		*hitmask_buffer |=
> +			((sig == prim_bucket_sigs[i]) << i);
> +		*hitmask_buffer |=
> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +	}
> +
> +}

Thanks for re-factoring the compare_signatures_...() code, it looks much cleaner that way.
One question I have - does it mean that now for x86 we always use 'sparse', while all
other platforms, ARM and non-ARM alike, switch to 'dense'?

> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
> new file mode 100644
> index 0000000000..7eec499e1f
> --- /dev/null
> +++ b/lib/hash/arch/x86/compare_signatures.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * x86's version uses a sparsely packed hitmask buffer:
> + * Every other bit is padding.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 0
> +
> +static inline void
> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> +			const struct rte_hash_bucket *prim_bkt,
> +			const struct rte_hash_bucket *sec_bkt,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	/* For match mask the first bit of every two bits indicates the match */
> +	switch (sig_cmp_fn) {
> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> +	case RTE_HASH_COMPARE_SSE:
> +		/* Compare all signatures in the bucket */
> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> +				_mm_load_si128(
> +					(__m128i const *)prim_bkt->sig_current),
> +				_mm_set1_epi16(sig)));
> +		/* Extract the even-index bits only */
> +		*prim_hash_matches &= 0x5555;
> +		/* Compare all signatures in the bucket */
> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> +				_mm_load_si128(
> +					(__m128i const *)sec_bkt->sig_current),
> +				_mm_set1_epi16(sig)));
> +		/* Extract the even-index bits only */
> +		*sec_hash_matches &= 0x5555;
> +		break;
> +#endif /* defined(__SSE2__) */
> +	default:
> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			*prim_hash_matches |=
> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
> +			*sec_hash_matches |=
> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
> +		}
> +	}
> +}

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-19 10:41     ` Konstantin Ananyev
@ 2024-03-19 13:09       ` Yoan Picchi
  2024-03-19 13:25         ` Konstantin Ananyev
  0 siblings, 1 reply; 43+ messages in thread
From: Yoan Picchi @ 2024-03-19 13:09 UTC (permalink / raw)
  To: Konstantin Ananyev, Yoan Picchi, Thomas Monjalon, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown

On 3/19/24 10:41, Konstantin Ananyev wrote:
> 
> Hi,
> 
>> Current hitmask includes padding due to Intel's SIMD
>> implementation detail. This patch allows non Intel SIMD
>> implementations to benefit from a dense hitmask.
>> In addition, the new dense hitmask interweave the primary
>> and secondary matches which allow a better cache usage and
>> enable future improvements for the SIMD implementations
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> ---
>>   .mailmap                                  |   2 +
>>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>>   lib/hash/arch/common/compare_signatures.h |  38 +++++
>>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>>   5 files changed, 255 insertions(+), 91 deletions(-)
>>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
>>   create mode 100644 lib/hash/arch/common/compare_signatures.h
>>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
>>
>> diff --git a/.mailmap b/.mailmap
>> index 66ebc20666..00b50414d3 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>>   Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>>   Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>>   Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
>> +Harjot Singh <harjot.singh@arm.com>
>>   Harman Kalra <hkalra@marvell.com>
>>   Harneet Singh <harneet.singh@intel.com>
>>   Harold Huang <baymaxhuang@gmail.com>
>> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>>   Yi Zhang <zhang.yi75@zte.com.cn>
>>   Yoann Desmouceaux <ydesmouc@cisco.com>
>> +Yoan Picchi <yoan.picchi@arm.com>
>>   Yogesh Jangra <yogesh.jangra@intel.com>
>>   Yogev Chaimovich <yogev@cgstowernetworks.com>
>>   Yongjie Gu <yongjiex.gu@intel.com>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> new file mode 100644
>> index 0000000000..1af6ba8190
>> --- /dev/null
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -0,0 +1,61 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * Arm's version uses a densely packed hitmask buffer:
>> + * Every bit is in use.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_NEON: {
>> +		uint16x8_t vmat, vsig, x;
>> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> +		uint16_t low, high;
>> +
>> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>> +		/* Compare all signatures in the primary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		low = (uint16_t)(vaddvq_u16(x));
>> +		/* Compare all signatures in the secondary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		high = (uint16_t)(vaddvq_u16(x));
>> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +
>> +		}
>> +		break;
>> +#endif
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*hitmask_buffer |=
>> +				((sig == prim_bucket_sigs[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +		}
>> +	}
>> +}
>> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
>> new file mode 100644
>> index 0000000000..dcf9444032
>> --- /dev/null
>> +++ b/lib/hash/arch/common/compare_signatures.h
>> @@ -0,0 +1,38 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * The generic version could use either a dense or sparsely packed hitmask buffer,
>> + * but the dense one is slightly faster.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	(void) sig_cmp_fn;
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +		*hitmask_buffer |=
>> +			((sig == prim_bucket_sigs[i]) << i);
>> +		*hitmask_buffer |=
>> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +	}
>> +
>> +}
> 
> Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> ARM and non-ARM platforms we switch to 'dense'?

Yes it does. x86 supports only the sparse method (the legacy one). Arm 
and the generic code could support both dense and sparse. The reason I made 
them use the dense method is that it was slightly faster in my tests 
(no need to add padding and shifts, among other benefits).
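
To illustrate (a minimal sketch, not the patch code; sig, prim_sigs and
sec_sigs are hypothetical uint16_t inputs, and RTE_HASH_BUCKET_ENTRIES is
assumed to be 8):

	/* Sparse (legacy x86): two 32-bit masks, a hit in entry i sets
	 * bit 2*i, so every other bit is padding.
	 */
	uint32_t prim_sparse = 0, sec_sparse = 0;
	for (unsigned int i = 0; i < 8; i++) {
		prim_sparse |= (sig == prim_sigs[i]) << (i << 1);
		sec_sparse |= (sig == sec_sigs[i]) << (i << 1);
	}
	/* first primary hit at entry rte_ctz32(prim_sparse) >> 1 */

	/* Dense: one 16-bit mask, primary hits in bits 0-7 and secondary
	 * hits in bits 8-15, every bit in use.
	 */
	uint16_t dense = 0;
	for (unsigned int i = 0; i < 8; i++) {
		dense |= (sig == prim_sigs[i]) << i;
		dense |= ((sig == sec_sigs[i]) << i) << 8;
	}
	/* first primary hit at entry rte_ctz32((uint8_t)dense) */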

> 
>> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
>> new file mode 100644
>> index 0000000000..7eec499e1f
>> --- /dev/null
>> +++ b/lib/hash/arch/x86/compare_signatures.h
>> @@ -0,0 +1,53 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * x86's version uses a sparsely packed hitmask buffer:
>> + * Every other bit is padding.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 0
>> +
>> +static inline void
>> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> +			const struct rte_hash_bucket *prim_bkt,
>> +			const struct rte_hash_bucket *sec_bkt,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	/* For match mask the first bit of every two bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_SSE:
>> +		/* Compare all signatures in the bucket */
>> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)prim_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*prim_hash_matches &= 0x5555;
>> +		/* Compare all signatures in the bucket */
>> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)sec_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*sec_hash_matches &= 0x5555;
>> +		break;
>> +#endif /* defined(__SSE2__) */
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*prim_hash_matches |=
>> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
>> +			*sec_hash_matches |=
>> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
>> +		}
>> +	}
>> +}


^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-19 13:09       ` Yoan Picchi
@ 2024-03-19 13:25         ` Konstantin Ananyev
  0 siblings, 0 replies; 43+ messages in thread
From: Konstantin Ananyev @ 2024-03-19 13:25 UTC (permalink / raw)
  To: Yoan Picchi, Yoan Picchi, Thomas Monjalon, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown



> >
> > Hi,
> >
> >> Current hitmask includes padding due to Intel's SIMD
> >> implementation detail. This patch allows non Intel SIMD
> >> implementations to benefit from a dense hitmask.
> >> In addition, the new dense hitmask interweave the primary
> >> and secondary matches which allow a better cache usage and
> >> enable future improvements for the SIMD implementations
> >>
> >> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> >> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> >> ---
> >>   .mailmap                                  |   2 +
> >>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
> >>   lib/hash/arch/common/compare_signatures.h |  38 +++++
> >>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
> >>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
> >>   5 files changed, 255 insertions(+), 91 deletions(-)
> >>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
> >>   create mode 100644 lib/hash/arch/common/compare_signatures.h
> >>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
> >>
> >> diff --git a/.mailmap b/.mailmap
> >> index 66ebc20666..00b50414d3 100644
> >> --- a/.mailmap
> >> +++ b/.mailmap
> >> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
> >>   Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
> >>   Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
> >>   Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> >> +Harjot Singh <harjot.singh@arm.com>
> >>   Harman Kalra <hkalra@marvell.com>
> >>   Harneet Singh <harneet.singh@intel.com>
> >>   Harold Huang <baymaxhuang@gmail.com>
> >> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
> >>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
> >>   Yi Zhang <zhang.yi75@zte.com.cn>
> >>   Yoann Desmouceaux <ydesmouc@cisco.com>
> >> +Yoan Picchi <yoan.picchi@arm.com>
> >>   Yogesh Jangra <yogesh.jangra@intel.com>
> >>   Yogev Chaimovich <yogev@cgstowernetworks.com>
> >>   Yongjie Gu <yongjiex.gu@intel.com>
> >> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..1af6ba8190
> >> --- /dev/null
> >> +++ b/lib/hash/arch/arm/compare_signatures.h
> >> @@ -0,0 +1,61 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * Arm's version uses a densely packed hitmask buffer:
> >> + * Every bit is in use.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 1
> >> +
> >> +static inline void
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >> +	/* For match mask every bits indicates the match */
> >> +	switch (sig_cmp_fn) {
> >> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> >> +	case RTE_HASH_COMPARE_NEON: {
> >> +		uint16x8_t vmat, vsig, x;
> >> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> >> +		uint16_t low, high;
> >> +
> >> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
> >> +		/* Compare all signatures in the primary bucket */
> >> +		vmat = vceqq_u16(vsig,
> >> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> >> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +		low = (uint16_t)(vaddvq_u16(x));
> >> +		/* Compare all signatures in the secondary bucket */
> >> +		vmat = vceqq_u16(vsig,
> >> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> >> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +		high = (uint16_t)(vaddvq_u16(x));
> >> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> >> +
> >> +		}
> >> +		break;
> >> +#endif
> >> +	default:
> >> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +			*hitmask_buffer |=
> >> +				((sig == prim_bucket_sigs[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +		}
> >> +	}
> >> +}
> >> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..dcf9444032
> >> --- /dev/null
> >> +++ b/lib/hash/arch/common/compare_signatures.h
> >> @@ -0,0 +1,38 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> >> + * but the dense one is slightly faster.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 1
> >> +
> >> +static inline void
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +	(void) sig_cmp_fn;
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >> +	/* For match mask every bits indicates the match */
> >> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +		*hitmask_buffer |=
> >> +			((sig == prim_bucket_sigs[i]) << i);
> >> +		*hitmask_buffer |=
> >> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +	}
> >> +
> >> +}
> >
> > Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> > One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> > ARM and non-ARM platforms we switch to 'dense'?
> 
> Yes it does. x86 support only the sparse method (the legacy one). Arm
> and generic code could support both dense and sparse. The reason I made
> them use the dense method is because it was slightly faster in my tests.

Ok, but before that, the 'generic' path (non-x86 and non-ARM) used the 'sparse' one, correct?
If so, then it probably needs to be outlined a bit more in the patch comments, and maybe
even in the release notes.
At least that would be my expectation; the hash lib maintainers probably need to say what
is the best way here.
The code refactoring itself - LGTM.

> (no need to add padding and shifts amongst other benefit.)
> 
> >
> >> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..7eec499e1f
> >> --- /dev/null
> >> +++ b/lib/hash/arch/x86/compare_signatures.h
> >> @@ -0,0 +1,53 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * x86's version uses a sparsely packed hitmask buffer:
> >> + * Every other bit is padding.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 0
> >> +
> >> +static inline void
> >> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> >> +			const struct rte_hash_bucket *prim_bkt,
> >> +			const struct rte_hash_bucket *sec_bkt,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +	/* For match mask the first bit of every two bits indicates the match */
> >> +	switch (sig_cmp_fn) {
> >> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> >> +	case RTE_HASH_COMPARE_SSE:
> >> +		/* Compare all signatures in the bucket */
> >> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> +				_mm_load_si128(
> >> +					(__m128i const *)prim_bkt->sig_current),
> >> +				_mm_set1_epi16(sig)));
> >> +		/* Extract the even-index bits only */
> >> +		*prim_hash_matches &= 0x5555;
> >> +		/* Compare all signatures in the bucket */
> >> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> +				_mm_load_si128(
> >> +					(__m128i const *)sec_bkt->sig_current),
> >> +				_mm_set1_epi16(sig)));
> >> +		/* Extract the even-index bits only */
> >> +		*sec_hash_matches &= 0x5555;
> >> +		break;
> >> +#endif /* defined(__SSE2__) */
> >> +	default:
> >> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +			*prim_hash_matches |=
> >> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
> >> +			*sec_hash_matches |=
> >> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
> >> +		}
> >> +	}
> >> +}


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-03-19 10:41     ` Konstantin Ananyev
@ 2024-03-19 16:09     ` Stephen Hemminger
  1 sibling, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2024-03-19 16:09 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown

On Tue, 12 Mar 2024 15:42:12 +0000
Yoan Picchi <yoan.picchi@arm.com> wrote:

> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),

Space around math operations please.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* RE: [EXTERNAL] [PATCH v7 2/4] hash: optimize compare signature for NEON
  2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-03-20  7:37     ` Pavan Nikhilesh Bhagavatula
  2024-04-11 13:32       ` Yoan Picchi
  0 siblings, 1 reply; 43+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2024-03-20  7:37 UTC (permalink / raw)
  To: Yoan Picchi, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown, Jerin Jacob

> Upon a successful comparison, NEON sets all the bits in the lane to 1
> We can skip shifting by simply masking with specific masks.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> ---
>  lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
>  1 file changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> index 1af6ba8190..b5a457f936 100644
> --- a/lib/hash/arch/arm/compare_signatures.h
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>  	switch (sig_cmp_fn) {
>  #if RTE_HASH_BUCKET_ENTRIES <= 8
>  	case RTE_HASH_COMPARE_NEON: {
> -		uint16x8_t vmat, vsig, x;
> -		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> -		uint16_t low, high;
> +		uint16x8_t vmat, hit1, hit2;
> +		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
> +		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
> 
> -		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>  		/* Compare all signatures in the primary bucket */
> -		vmat = vceqq_u16(vsig,
> -			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> -		low = (uint16_t)(vaddvq_u16(x));
> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
> +		hit1 = vandq_u16(vmat, mask);
> +
>  		/* Compare all signatures in the secondary bucket */
> -		vmat = vceqq_u16(vsig,
> -			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> -		high = (uint16_t)(vaddvq_u16(x));
> -		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
> +		hit2 = vandq_u16(vmat, mask);
> 
> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
> +		hit2 = vorrq_u16(hit1, hit2);
> +		*hitmask_buffer = vaddvq_u16(hit2);

Since vaddv is expensive, could you convert it to vshrn?

https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon

https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_neon.h#L226
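
Roughly, the idiom from those links would look like this here (just a
sketch, untested, reusing the vsig and prim_bucket_sigs names from the
patch):

	uint16x8_t vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
	/* Narrowing shift by 4: each 16-bit lane keeps bits 4-11, so a
	 * 0xFFFF match lane becomes a 0xF nibble.
	 */
	uint8x8_t nibbles = vshrn_n_u16(vmat, 4);
	uint64_t nibble_mask = vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
	/* first hit at entry rte_ctz64(nibble_mask) >> 2 */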

>  		}
>  		break;
>  #endif
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [EXTERNAL] [PATCH v7 2/4] hash: optimize compare signature for NEON
  2024-03-20  7:37     ` [EXTERNAL] " Pavan Nikhilesh Bhagavatula
@ 2024-04-11 13:32       ` Yoan Picchi
  0 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-11 13:32 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Yoan Picchi, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown, Jerin Jacob

On 3/20/24 07:37, Pavan Nikhilesh Bhagavatula wrote:
>> Upon a successful comparison, NEON sets all the bits in the lane to 1
>> We can skip shifting by simply masking with specific masks.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> ---
>>   lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
>>   1 file changed, 11 insertions(+), 13 deletions(-)
>>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> index 1af6ba8190..b5a457f936 100644
>> --- a/lib/hash/arch/arm/compare_signatures.h
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>>   	switch (sig_cmp_fn) {
>>   #if RTE_HASH_BUCKET_ENTRIES <= 8
>>   	case RTE_HASH_COMPARE_NEON: {
>> -		uint16x8_t vmat, vsig, x;
>> -		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> -		uint16_t low, high;
>> +		uint16x8_t vmat, hit1, hit2;
>> +		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>> +		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>
>> -		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>   		/* Compare all signatures in the primary bucket */
>> -		vmat = vceqq_u16(vsig,
>> -			vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> -		low = (uint16_t)(vaddvq_u16(x));
>> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
>> +		hit1 = vandq_u16(vmat, mask);
>> +
>>   		/* Compare all signatures in the secondary bucket */
>> -		vmat = vceqq_u16(vsig,
>> -			vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> -		high = (uint16_t)(vaddvq_u16(x));
>> -		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
>> +		hit2 = vandq_u16(vmat, mask);
>>
>> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
>> +		hit2 = vorrq_u16(hit1, hit2);
>> +		*hitmask_buffer = vaddvq_u16(hit2);
> 
> Since vaddv is expensive could you convert it to vshrn?
> 
> https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
> 
> https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_neon.h#L226

Thank you for those links, it was a good read.
Unfortunately I don't think it is a good use case here. A decent part of 
the speedup I get comes from using a dense hitmask: i.e. every bit counts, 
with no padding. Using vshrn would leave 4 bits of padding per entry, and 
stripping them would be more expensive than using a regular reduce.
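
Concretely (a rough sketch): every entry in the vshrn result occupies a
nibble, so the scan loop needs a 64-bit mask and extra shifts on both the
index recovery and the bit clearing:

	while (nibble_mask) {
		uint32_t hit_index = rte_ctz64(nibble_mask) >> 2;
		/* ... compare the key at hit_index ... */
		nibble_mask &= ~(0xfULL << (hit_index << 2));
	}

With the dense mask every bit is an entry: hit_index = rte_ctz32(hitmask)
and the clear is just hitmask &= ~(1 << hit_index).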

> 
>>   		}
>>   		break;
>>   #endif
>> --
>> 2.25.1
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v8 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (6 preceding siblings ...)
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
@ 2024-04-17 16:08 ` Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
  8 siblings, 4 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple of ints to uint16_t in compare_signatures_dense
  Several coding-style fixes

V3->V4:
  Rebase

V4->V5:
  Commit message

V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE

V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain

V7->V8:
  Commit message
  Typos and missing spaces

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 394 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  2024-04-17 18:12     ` Stephen Hemminger
  2024-04-17 16:08   ` [PATCH v8 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 43+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Current hitmask includes padding due to an Intel SIMD
implementation detail. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
The default non-SIMD path now uses this dense mask.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 255 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..63eb341d0e
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask, every bit indicates a match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..59157d31e1
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask, every bit indicates a match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask, the first bit of every two bits indicates a match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v8 2/4] hash: optimize compare signature for NEON
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking with specific masks.
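
For example (a worked sketch of one lane): if entry 3 matches, vceqq_u16
yields 0xFFFF in lane 3. ANDing with the constant mask {0x1, 0x2, 0x4,
0x8, 0x10, 0x20, 0x40, 0x80} leaves exactly 0x8 in that lane and 0 in the
others, so vaddvq_u16 over the lanes already produces the finished bitmask
with bit 3 set, with no per-lane shift needed.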

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 63eb341d0e..2601ed68b3 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown

This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/* Bulk lookup after add with same hash */
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk lookup on empty table */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v8 4/4] hash: add SVE support for bulk key lookup
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-04-17 16:08   ` [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang

- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than the optimized NEON on an N2 processor
for 128b vectors.

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  7 +++-
 lib/hash/rte_cuckoo_hash.h             |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 2601ed68b3..140ff97b1d 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0697743cdf..75f555ba2c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-04-17 18:12     ` Stephen Hemminger
  0 siblings, 0 replies; 43+ messages in thread
From: Stephen Hemminger @ 2024-04-17 18:12 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown

On Wed, 17 Apr 2024 16:08:04 +0000
Yoan Picchi <yoan.picchi@arm.com> wrote:

> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> new file mode 100644
> index 0000000000..59157d31e1
> --- /dev/null
> +++ b/lib/hash/arch/common/compare_signatures.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> + * but the dense one is slightly faster.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	(void) sig_cmp_fn;

Please use __rte_unused attribute for this.
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");

The message should be indented like multi-line function args.
If possible shorten the message.

> +
> +	/* For match mask every bits indicates the match */
> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +		*hitmask_buffer |=
> +			((sig == prim_bucket_sigs[i]) << i);

Don't really need () around the whole expression here.

> +		*hitmask_buffer |=
> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +	}
> +
> +}
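
For reference, the attribute form being requested here is what v9 of
the patch ends up using:

	static inline void
	compare_signatures_dense(uint16_t *hitmask_buffer,
				const uint16_t *prim_bucket_sigs,
				const uint16_t *sec_bucket_sigs,
				uint16_t sig,
				__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)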

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v9 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (7 preceding siblings ...)
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
@ 2024-04-30 16:27 ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  8 siblings, 4 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi

This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of each being in its
own array.

Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  NEON is up to 7-10% faster (ran on an Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (ran on a
    Graviton 3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (ran on a
    Graviton 3 cloud instance)

V2->V3:
  Remove a redundant if in the test
  Change a couple of ints to uint16_t in compare_signatures_dense
  Several coding-style fixes

V3->V4:
  Rebase

V4->V5:
  Commit message

V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE

V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain

V7->V8:
  Commit message
  Typos and missing spaces

V8->V9:
  Use __rte_unused instead of (void)
  Fix an indentation mistake

Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup

 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  37 ++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 393 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

The current hitmask includes padding due to an Intel SIMD
implementation detail. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
The default non-SIMD path now uses this dense mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  37 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 254 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h

diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..46d15da89f
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+				  "hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask, every bit indicates a match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				(sig == prim_bucket_sigs[i]) << i;
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..f43b367005
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+				  "hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask, every bit indicates a match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			(sig == prim_bucket_sigs[i]) << i;
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..db342804e1
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask, the first bit of each two-bit pair indicates a match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				(sig == prim_bkt->sig_current[i]) << (i << 1);
+			*sec_hash_matches |=
+				(sig == sec_bkt->sig_current[i]) << (i << 1);
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v9 2/4] hash: optimize compare signature for NEON
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown

Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip the shifting by simply masking each lane with its specific
bit mask.
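
A scalar analogue of the trick (illustrative values; lane[i] stands for
the vceqq_u16() result, which is 0xFFFF on a hit): since a hit lane is
all-ones, ANDing with the per-lane constants {0x1, 0x2, 0x4, ...} yields
that lane's bit directly, and a horizontal add of disjoint powers of two
is equivalent to an OR:

	uint16_t lane[8] = {0xFFFF, 0, 0, 0xFFFF, 0, 0, 0, 0};
	const uint16_t bit[8] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
	uint16_t hitmask = 0;

	for (int i = 0; i < 8; i++)
		hitmask += lane[i] & bit[i];	/* '+' acts as '|' here */
	/* hitmask == 0x9: entries 0 and 3 matched */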

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 46d15da89f..72bd171484 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown

This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of
entries in a hash bucket.
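
The pseudo hash now returns 3 | (3 << 16) rather than 3, presumably so
that the 16-bit short signature (taken from the upper half of the
32-bit hash) is non-zero as well, keeping every key colliding on both
the bucket index and the signature. The bulk-lookup pattern the test
exercises is, condensed (handle, keys and KEY_PER_BUCKET as set up in
the test itself):

	const void *key_array[KEY_PER_BUCKET + 1];
	int32_t pos[KEY_PER_BUCKET + 1];
	int ret;

	for (unsigned int i = 0; i < KEY_PER_BUCKET + 1; i++)
		key_array[i] = &keys[i];

	/* Returns 0 on success; pos[i] then holds the key's index,
	 * or -ENOENT if the key was not found. */
	ret = rte_hash_lookup_bulk(handle, key_array,
				   KEY_PER_BUCKET + 1, pos);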

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/* Bulk lookup after add with same hash */
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk lookup on empty table */
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v9 4/4] hash: add SVE support for bulk key lookup
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-04-30 16:27   ` [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  3 siblings, 0 replies; 43+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang

- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than the optimized NEON on an N2 processor
for 128b vectors.
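
The single-pass branch below applies when the SVE vector length can
hold both buckets at once (vl >= 2 * RTE_HASH_BUCKET_ENTRIES); a scalar
model of that merged comparison (illustrative helper, not DPDK code,
assuming 8 entries per bucket):

	#include <stdint.h>
	#include <string.h>

	/* Splice both buckets into one array, compare every lane against
	 * sig in a single pass, and set bit i for each matching lane. */
	static inline uint16_t
	merged_hitmask(const uint16_t prim[8], const uint16_t sec[8],
			uint16_t sig)
	{
		uint16_t merged[16], hitmask = 0;

		memcpy(merged, prim, 8 * sizeof(uint16_t));
		memcpy(merged + 8, sec, 8 * sizeof(uint16_t));
		for (unsigned int i = 0; i < 16; i++)
			hitmask |= (uint16_t)((sig == merged[i]) << i);
		return hitmask;
	}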

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  7 +++-
 lib/hash/rte_cuckoo_hash.h             |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 72bd171484..b4b4cf04e9 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0697743cdf..75f555ba2c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2024-04-30 16:28 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
2023-10-20 16:51 ` [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2023-10-20 16:51 ` [PATCH v2 2/4] hash: optimize compare signature for NEON Yoan Picchi
2023-10-20 16:51 ` [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2023-10-20 16:51 ` [PATCH v2 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2024-02-27 17:42   ` [PATCH v5 2/4] hash: optimize compare signature for NEON Yoan Picchi
2024-02-27 17:42   ` [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
2024-02-28 10:56     ` Konstantin Ananyev
2024-02-28 14:48       ` Yoan Picchi
2024-03-04 13:35         ` Konstantin Ananyev
2024-03-05 15:36           ` Yoan Picchi
2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2024-03-11 23:21   ` [PATCH v6 2/4] hash: optimize compare signature for NEON Yoan Picchi
2024-03-11 23:21   ` [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
2024-03-12  3:57     ` fengchengwen
2024-03-12 15:08       ` Yoan Picchi
2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2024-03-19 10:41     ` Konstantin Ananyev
2024-03-19 13:09       ` Yoan Picchi
2024-03-19 13:25         ` Konstantin Ananyev
2024-03-19 16:09     ` Stephen Hemminger
2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
2024-03-20  7:37     ` [EXTERNAL] " Pavan Nikhilesh Bhagavatula
2024-04-11 13:32       ` Yoan Picchi
2024-03-12 15:42   ` [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2024-03-12 15:42   ` [PATCH v7 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2024-04-17 18:12     ` Stephen Hemminger
2024-04-17 16:08   ` [PATCH v8 2/4] hash: optimize compare signature for NEON Yoan Picchi
2024-04-17 16:08   ` [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2024-04-17 16:08   ` [PATCH v8 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
2024-04-30 16:27   ` [PATCH v9 2/4] hash: optimize compare signature for NEON Yoan Picchi
2024-04-30 16:27   ` [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
2024-04-30 16:27   ` [PATCH v9 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
