- * [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 2/4] hash: optimize compare signature for NEON Yoan Picchi
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, Yoan Picchi
The current hitmask includes padding due to an implementation detail
of Intel's SIMD code path. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)
diff --git a/.mailmap b/.mailmap
index 3f5bab26a8..b9c49aa7f6 100644
--- a/.mailmap
+++ b/.mailmap
@@ -485,6 +485,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1602,6 +1603,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 19b23f2a97..2aa96eb862 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1850,8 +1850,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << i);
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << i);
+		}
+	}
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
@@ -1878,25 +1920,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 		/* Extract the even-index bits only */
 		*sec_hash_matches &= 0x5555;
 		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
+#endif /* defined(__SSE2__) */
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
 			*prim_hash_matches |=
@@ -1907,6 +1931,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 	}
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1921,18 +1947,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+			primary_bkt[i], secondary_bkt[i],
+			sig[i], h->sig_cmp_fn);
+#else
+		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+#endif
 
 		if (prim_hitmask[i]) {
 			uint32_t first_hit =
 					__builtin_ctzl(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1946,7 +1984,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		if (sec_hitmask[i]) {
 			uint32_t first_hit =
 					__builtin_ctzl(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1963,7 +2001,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		while (prim_hitmask[i]) {
 			uint32_t hit_index =
 					__builtin_ctzl(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1985,13 +2023,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 
 		while (sec_hitmask[i]) {
 			uint32_t hit_index =
 					__builtin_ctzl(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2014,7 +2052,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2069,6 +2107,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2082,14 +2126,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+#else
+			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+				primary_bkt[i], secondary_bkt[i],
+				sig[i], h->sig_cmp_fn);
+#endif
 
 			if (prim_hitmask[i]) {
 				uint32_t first_hit =
 						__builtin_ctzl(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2103,7 +2153,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			if (sec_hitmask[i]) {
 				uint32_t first_hit =
 						__builtin_ctzl(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2119,7 +2169,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2145,13 +2195,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 
 			while (sec_hitmask[i]) {
 				uint32_t hit_index =
 						__builtin_ctzl(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2178,7 +2228,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v2 2/4] hash: optimize compare signature for NEON
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, Yoan Picchi
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can therefore skip the shifting by simply masking each lane with its own bit.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 2aa96eb862..a4b907c45c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1864,19 +1864,17 @@ compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16x8_t vmat, x;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
 		}
 		break;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2023-10-20 16:51 ` [PATCH v2 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, Yoan Picchi, Harjot Singh
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..b6e22c5ecc 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | 3 << 16;
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	if (ret == 0)
+		for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+			print_key_info("Blk_Lkp", key_array[i], pos[i]);
+			RETURN_IF_ERROR(pos[i] != expected_pos[i],
+					"failed to find key (pos[%u]=%d)", i, pos[i]);
+		}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	if (ret == 0)
+		for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+			print_key_info("Blk_Lkp", key_array[i], pos[i]);
+			RETURN_IF_ERROR(pos[i] != -ENOENT,
+					"failed to find key (pos[%u]=%d)", i, pos[i]);
+		}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v2 4/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (2 preceding siblings ...)
  2023-10-20 16:51 ` [PATCH v2 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2023-10-20 16:51 ` Yoan Picchi
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2023-10-20 16:51 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, Yoan Picchi, Harjot Singh
- Implement SVE code for comparing signatures in bulk lookup.
- Add defines in the code for SVE support.
- Optimise the NEON code.
- The new SVE code is ~3% slower than the optimized NEON code on the N2 processor.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
 lib/hash/rte_cuckoo_hash.h |   1 +
 2 files changed, 151 insertions(+), 46 deletions(-)
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a4b907c45c..cda39d1441 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -435,8 +435,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1853,37 +1856,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 #if defined(__ARM_NEON)
 
 static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
 	unsigned int i;
 
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
+#if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, x;
+		uint16x8_t vmat, hit1, hit2;
 		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
+
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
+		}
+		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				int lower_half = 0;
+				int upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				*(hitmask_buffer+(i/8)) = lower_half | upper_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
 		}
 		break;
+#endif
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << i);
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << i);
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
@@ -1901,7 +1970,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1941,14 +2010,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	__hash_rw_reader_lock(h);
@@ -1956,18 +2029,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-			primary_bkt[i], secondary_bkt[i],
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					__builtin_ctzl(prim_hitmask[i])
+					__builtin_ctzl(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
@@ -1979,9 +2058,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					__builtin_ctzl(sec_hitmask[i])
+					__builtin_ctzl(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
@@ -1996,9 +2075,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					__builtin_ctzl(prim_hitmask[i])
+					__builtin_ctzl(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
@@ -2021,12 +2108,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					__builtin_ctzl(sec_hitmask[i])
+					__builtin_ctzl(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
@@ -2050,7 +2137,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2100,15 +2187,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	for (i = 0; i < num_keys; i++)
@@ -2125,18 +2215,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-				primary_bkt[i], secondary_bkt[i],
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						__builtin_ctzl(prim_hitmask[i])
+						__builtin_ctzl(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
@@ -2148,9 +2244,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						__builtin_ctzl(sec_hitmask[i])
+						__builtin_ctzl(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
@@ -2164,9 +2260,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						__builtin_ctzl(prim_hitmask[i])
+						__builtin_ctzl(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
@@ -2193,12 +2297,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						__builtin_ctzl(sec_hitmask[i])
+						__builtin_ctzl(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				__atomic_load_n(
@@ -2226,7 +2330,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index eb2644f74b..356ec2a69e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -148,6 +148,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v5 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (3 preceding siblings ...)
  2023-10-20 16:51 ` [PATCH v2 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-02-27 17:41 ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
                   ` (6 subsequent siblings)
  11 siblings, 4 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:41 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
From: Yoan Picchi <yoan.picchi@foss.arm.com>
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  NEON is up to 7-10% faster (ran on Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                   |   2 +
 app/test/test_hash.c       |  99 ++++++++++----
 lib/hash/rte_cuckoo_hash.c | 264 +++++++++++++++++++++++++++++--------
 lib/hash/rte_cuckoo_hash.h |   1 +
 4 files changed, 287 insertions(+), 79 deletions(-)
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
The current hitmask includes padding due to an Intel SIMD
implementation detail. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                   |   2 +
 lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 34 deletions(-)
diff --git a/.mailmap b/.mailmap
index 12d2875641..60500bbe36 100644
--- a/.mailmap
+++ b/.mailmap
@@ -492,6 +492,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1625,6 +1626,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0550165584 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1857,8 +1857,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
+#if defined(__ARM_NEON)
+
+static inline void
+compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << i);
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << i);
+		}
+	}
+}
+
+#else
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
@@ -1885,25 +1927,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 		/* Extract the even-index bits only */
 		*sec_hash_matches &= 0x5555;
 		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
+#endif /* defined(__SSE2__) */
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
 			*prim_hash_matches |=
@@ -1914,6 +1938,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 	}
 }
 
+#endif /* defined(__ARM_NEON) */
+
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1928,18 +1954,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
+			primary_bkt[i], secondary_bkt[i],
+			sig[i], h->sig_cmp_fn);
+#else
+		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+#endif
 
 		if (prim_hitmask[i]) {
 			uint32_t first_hit =
 					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1953,7 +1991,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		if (sec_hitmask[i]) {
 			uint32_t first_hit =
 					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1970,7 +2008,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		while (prim_hitmask[i]) {
 			uint32_t hit_index =
 					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +2030,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 
 		while (sec_hitmask[i]) {
 			uint32_t hit_index =
 					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2059,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2076,6 +2114,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if defined(__ARM_NEON)
+	const int hitmask_padding = 0;
+#else
+	const int hitmask_padding = 1;
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2133,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if defined(__ARM_NEON)
+			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+#else
+			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+				primary_bkt[i], secondary_bkt[i],
+				sig[i], h->sig_cmp_fn);
+#endif
 
 			if (prim_hitmask[i]) {
 				uint32_t first_hit =
 						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2110,7 +2160,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			if (sec_hitmask[i]) {
 				uint32_t first_hit =
 						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2126,7 +2176,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 			while (prim_hitmask[i]) {
 				uint32_t hit_index =
 						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2202,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 
 			while (sec_hitmask[i]) {
 				uint32_t hit_index =
 						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2235,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v5 2/4] hash: optimize compare signature for NEON
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking with specific masks.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0550165584..a07dd3a28d 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1871,19 +1871,17 @@ compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16x8_t vmat, x;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vandq_u16(vmat, mask);
 		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
 		}
 		break;
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..c4e7f8190e 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | 3 << 16;
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(¶ms_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-02-27 17:42   ` [PATCH v5 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-02-27 17:42   ` Yoan Picchi
  2024-02-28 10:56     ` Konstantin Ananyev
  3 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-02-27 17:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- Added Defines in code for SVE code support.
- Optimise NEON code
- New SVE code is ~5% slower than optimized NEON for N2 processor.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
 lib/hash/rte_cuckoo_hash.h |   1 +
 2 files changed, 151 insertions(+), 46 deletions(-)
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index a07dd3a28d..231d6d6ded 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 #if defined(__ARM_NEON)
 
 static inline void
-compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
 	unsigned int i;
 
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
 	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, x;
+		uint16x8_t vmat, hit1, hit2;
 		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vandq_u16(vmat, mask);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
+
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
+		}
+		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
 		}
 		break;
+#endif
 	default:
 		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << i);
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << i);
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
@@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
@@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	__hash_rw_reader_lock(h);
@@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-			primary_bkt[i], secondary_bkt[i],
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
+					rte_ctz32(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
@@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
+					rte_ctz32(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
@@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
+					rte_ctz32(prim_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
@@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
+					rte_ctz32(sec_hitmask)
 					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
@@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
 #if defined(__ARM_NEON)
 	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
 #else
 	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 #endif
 
 	for (i = 0; i < num_keys; i++)
@@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
 #if defined(__ARM_NEON)
-			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
-				primary_bkt[i], secondary_bkt[i],
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
 #else
-			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
 #endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
+						rte_ctz32(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
@@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
+						rte_ctz32(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
@@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if defined(__ARM_NEON)
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
+						rte_ctz32(prim_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
@@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
+						rte_ctz32(sec_hitmask)
 						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
@@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index 8ea793c66e..ed18e1f41e 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * RE: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-27 17:42   ` [PATCH v5 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-02-28 10:56     ` Konstantin Ananyev
  2024-02-28 14:48       ` Yoan Picchi
  0 siblings, 1 reply; 73+ messages in thread
From: Konstantin Ananyev @ 2024-02-28 10:56 UTC (permalink / raw)
  To: Yoan Picchi, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
> 
> - Implemented SVE code for comparing signatures in bulk lookup.
> - Added Defines in code for SVE code support.
> - Optimise NEON code
> - New SVE code is ~5% slower than optimized NEON for N2 processor.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
>  lib/hash/rte_cuckoo_hash.h |   1 +
>  2 files changed, 151 insertions(+), 46 deletions(-)
> 
> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> index a07dd3a28d..231d6d6ded 100644
> --- a/lib/hash/rte_cuckoo_hash.c
> +++ b/lib/hash/rte_cuckoo_hash.c
> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>  	else
>  #elif defined(RTE_ARCH_ARM64)
> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> +	}
>  	else
>  #endif
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>  #if defined(__ARM_NEON)
> 
>  static inline void
> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> -			const struct rte_hash_bucket *prim_bkt,
> -			const struct rte_hash_bucket *sec_bkt,
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
>  			uint16_t sig,
>  			enum rte_hash_sig_compare_function sig_cmp_fn)
>  {
>  	unsigned int i;
> 
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
>  	/* For match mask every bits indicates the match */
>  	switch (sig_cmp_fn) {
Could you please move the arch-specific comparison code into arch-specific headers (or similar)?
The generic code is getting really hard to read and understand with all these #ifdefs and arch-specific instructions...
> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>  	case RTE_HASH_COMPARE_NEON: {
> -		uint16x8_t vmat, x;
> +		uint16x8_t vmat, hit1, hit2;
>  		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>  		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
> 
>  		/* Compare all signatures in the primary bucket */
> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
> -		x = vandq_u16(vmat, mask);
> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
> +		hit1 = vandq_u16(vmat, mask);
> +
>  		/* Compare all signatures in the secondary bucket */
> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
> -		x = vandq_u16(vmat, mask);
> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
> +		hit2 = vandq_u16(vmat, mask);
> +
> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
> +		hit2 = vorrq_u16(hit1, hit2);
> +		*hitmask_buffer = vaddvq_u16(hit2);
> +		}
> +		break;
> +#endif
> +#if defined(RTE_HAS_SVE_ACLE)
> +	case RTE_HASH_COMPARE_SVE: {
> +		svuint16_t vsign, shift, sv_matches;
> +		svbool_t pred, match, bucket_wide_pred;
> +		int i = 0;
> +		uint64_t vl = svcnth();
> +
> +		vsign = svdup_u16(sig);
> +		shift = svindex_u16(0, 1);
> +
> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
> +			svuint16_t primary_array_vect, secondary_array_vect;
> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
> +
> +			/* We merged the two vectors so we can do both comparison at once */
> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
> +				primary_array_vect,
> +				secondary_array_vect);
> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
> +
> +			/* Compare all signatures in the buckets */
> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
> +			if (svptest_any(svptrue_b16(), match)) {
> +				sv_matches = svdup_u16(1);
> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
> +			}
> +		} else {
> +			do {
> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
> +				uint16_t lower_half = 0;
> +				uint16_t upper_half = 0;
> +				/* Compare all signatures in the primary bucket */
> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> +							&prim_bucket_sigs[i]));
> +				if (svptest_any(svptrue_b16(), match)) {
> +					sv_matches = svdup_u16(1);
> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
> +				}
> +				/* Compare all signatures in the secondary bucket */
> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> +							&sec_bucket_sigs[i]));
> +				if (svptest_any(svptrue_b16(), match)) {
> +					sv_matches = svdup_u16(1);
> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
> +						<< RTE_HASH_BUCKET_ENTRIES;
> +				}
> +				hitmask_buffer[i/8] = upper_half | lower_half;
> +				i += vl;
> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
> +		}
>  		}
>  		break;
> +#endif
>  	default:
>  		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> -			*prim_hash_matches |=
> -				((sig == prim_bkt->sig_current[i]) << i);
> -			*sec_hash_matches |=
> -				((sig == sec_bkt->sig_current[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == prim_bucket_sigs[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>  		}
>  	}
>  }
> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
> 
>  	/* For match mask the first bit of every two bits indicates the match */
>  	switch (sig_cmp_fn) {
> -#if defined(__SSE2__)
> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>  	case RTE_HASH_COMPARE_SSE:
>  		/* Compare all signatures in the bucket */
>  		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  	uint64_t hits = 0;
>  	int32_t i;
>  	int32_t ret;
> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  	struct rte_hash_bucket *cur_bkt, *next_bkt;
> 
>  #if defined(__ARM_NEON)
>  	const int hitmask_padding = 0;
> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +
> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>  #else
>  	const int hitmask_padding = 1;
> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  #endif
> 
>  	__hash_rw_reader_lock(h);
> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  	/* Compare signatures and prefetch key slot of first hit */
>  	for (i = 0; i < num_keys; i++) {
>  #if defined(__ARM_NEON)
> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> -			primary_bkt[i], secondary_bkt[i],
> +		uint16_t *hitmask = &hitmask_buffer[i];
> +		compare_signatures_dense(hitmask,
> +			primary_bkt[i]->sig_current,
> +			secondary_bkt[i]->sig_current,
>  			sig[i], h->sig_cmp_fn);
> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>  #else
> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>  			primary_bkt[i], secondary_bkt[i],
>  			sig[i], h->sig_cmp_fn);
> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>  #endif
> 
> -		if (prim_hitmask[i]) {
> +		if (prim_hitmask) {
>  			uint32_t first_hit =
> -					rte_ctz32(prim_hitmask[i])
> +					rte_ctz32(prim_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				primary_bkt[i]->key_idx[first_hit];
> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  			continue;
>  		}
> 
> -		if (sec_hitmask[i]) {
> +		if (sec_hitmask) {
>  			uint32_t first_hit =
> -					rte_ctz32(sec_hitmask[i])
> +					rte_ctz32(sec_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				secondary_bkt[i]->key_idx[first_hit];
> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  	/* Compare keys, first hits in primary first */
>  	for (i = 0; i < num_keys; i++) {
>  		positions[i] = -ENOENT;
> -		while (prim_hitmask[i]) {
> +#if defined(__ARM_NEON)
> +		uint16_t *hitmask = &hitmask_buffer[i];
> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> +#else
> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
> +#endif
> +		while (prim_hitmask) {
>  			uint32_t hit_index =
> -					rte_ctz32(prim_hitmask[i])
> +					rte_ctz32(prim_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				primary_bkt[i]->key_idx[hit_index];
> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  				positions[i] = key_idx - 1;
>  				goto next_key;
>  			}
> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  		}
> 
> -		while (sec_hitmask[i]) {
> +		while (sec_hitmask) {
>  			uint32_t hit_index =
> -					rte_ctz32(sec_hitmask[i])
> +					rte_ctz32(sec_hitmask)
>  					>> hitmask_padding;
>  			uint32_t key_idx =
>  				secondary_bkt[i]->key_idx[hit_index];
> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>  				positions[i] = key_idx - 1;
>  				goto next_key;
>  			}
> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  		}
>  next_key:
>  		continue;
> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  	uint64_t hits = 0;
>  	int32_t i;
>  	int32_t ret;
> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  	struct rte_hash_bucket *cur_bkt, *next_bkt;
>  	uint32_t cnt_b, cnt_a;
> 
>  #if defined(__ARM_NEON)
>  	const int hitmask_padding = 0;
> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> +	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
>  #else
>  	const int hitmask_padding = 1;
> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  #endif
> 
>  	for (i = 0; i < num_keys; i++)
> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  		/* Compare signatures and prefetch key slot of first hit */
>  		for (i = 0; i < num_keys; i++) {
>  #if defined(__ARM_NEON)
> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> -				primary_bkt[i], secondary_bkt[i],
> +			uint16_t *hitmask = &hitmask_buffer[i];
> +			compare_signatures_dense(hitmask,
> +				primary_bkt[i]->sig_current,
> +				secondary_bkt[i]->sig_current,
>  				sig[i], h->sig_cmp_fn);
> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>  #else
> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>  				primary_bkt[i], secondary_bkt[i],
>  				sig[i], h->sig_cmp_fn);
> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>  #endif
> 
> -			if (prim_hitmask[i]) {
> +			if (prim_hitmask) {
>  				uint32_t first_hit =
> -						rte_ctz32(prim_hitmask[i])
> +						rte_ctz32(prim_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  					primary_bkt[i]->key_idx[first_hit];
> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  				continue;
>  			}
> 
> -			if (sec_hitmask[i]) {
> +			if (sec_hitmask) {
>  				uint32_t first_hit =
> -						rte_ctz32(sec_hitmask[i])
> +						rte_ctz32(sec_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  					secondary_bkt[i]->key_idx[first_hit];
> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> 
>  		/* Compare keys, first hits in primary first */
>  		for (i = 0; i < num_keys; i++) {
> -			while (prim_hitmask[i]) {
> +#if defined(__ARM_NEON)
> +			uint16_t *hitmask = &hitmask_buffer[i];
> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> +#else
> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
> +#endif
> +			while (prim_hitmask) {
>  				uint32_t hit_index =
> -						rte_ctz32(prim_hitmask[i])
> +						rte_ctz32(prim_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  				rte_atomic_load_explicit(
> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  					positions[i] = key_idx - 1;
>  					goto next_key;
>  				}
> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  			}
> 
> -			while (sec_hitmask[i]) {
> +			while (sec_hitmask) {
>  				uint32_t hit_index =
> -						rte_ctz32(sec_hitmask[i])
> +						rte_ctz32(sec_hitmask)
>  						>> hitmask_padding;
>  				uint32_t key_idx =
>  				rte_atomic_load_explicit(
> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>  					positions[i] = key_idx - 1;
>  					goto next_key;
>  				}
> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>  			}
>  next_key:
>  			continue;
> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
> index 8ea793c66e..ed18e1f41e 100644
> --- a/lib/hash/rte_cuckoo_hash.h
> +++ b/lib/hash/rte_cuckoo_hash.h
> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
>  	RTE_HASH_COMPARE_SCALAR = 0,
>  	RTE_HASH_COMPARE_SSE,
>  	RTE_HASH_COMPARE_NEON,
> +	RTE_HASH_COMPARE_SVE,
>  	RTE_HASH_COMPARE_NUM
>  };
> 
> --
> 2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-28 10:56     ` Konstantin Ananyev
@ 2024-02-28 14:48       ` Yoan Picchi
  2024-03-04 13:35         ` Konstantin Ananyev
  0 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-02-28 14:48 UTC (permalink / raw)
  To: Konstantin Ananyev, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
On 2/28/24 10:56, Konstantin Ananyev wrote:
> 
>>
>> - Implemented SVE code for comparing signatures in bulk lookup.
>> - Added Defines in code for SVE code support.
>> - Optimise NEON code
>> - New SVE code is ~5% slower than optimized NEON for N2 processor.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>>   lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
>>   lib/hash/rte_cuckoo_hash.h |   1 +
>>   2 files changed, 151 insertions(+), 46 deletions(-)
>>
>> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
>> index a07dd3a28d..231d6d6ded 100644
>> --- a/lib/hash/rte_cuckoo_hash.c
>> +++ b/lib/hash/rte_cuckoo_hash.c
>> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>>   	else
>>   #elif defined(RTE_ARCH_ARM64)
>> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
>> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
>> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
>> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
>> +	}
>>   	else
>>   #endif
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
>> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>>   #if defined(__ARM_NEON)
>>
>>   static inline void
>> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> -			const struct rte_hash_bucket *prim_bkt,
>> -			const struct rte_hash_bucket *sec_bkt,
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>>   			uint16_t sig,
>>   			enum rte_hash_sig_compare_function sig_cmp_fn)
>>   {
>>   	unsigned int i;
>>
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>>   	/* For match mask every bits indicates the match */
>>   	switch (sig_cmp_fn) {
> 
> Can I ask to move arch specific comparison code into some arch-specific headers or so?
> It is getting really hard to read and understand the generic code with all these ifdefs and arch specific instructions...
> 
I can easily enough move the compare_signatures functions into arm/x86
directories and keep a default version in the generic code.
The problem would be the bulk lookup. That function already exists in
two versions (the l and lf variants). If I remove the #ifdefs, I'll need
to duplicate each of them again, giving 4 nearly identical versions (dense
and sparse for each). The only third option I see would be some
preprocessor macro to patch the function, but that looks even dirtier to me.
I think duplicating the code would be bad, but I can do it if you want,
unless you have a better solution?
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>>   	case RTE_HASH_COMPARE_NEON: {
>> -		uint16x8_t vmat, x;
>> +		uint16x8_t vmat, hit1, hit2;
>>   		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>>   		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>
>>   		/* Compare all signatures in the primary bucket */
>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
>> -		x = vandq_u16(vmat, mask);
>> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
>> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
>> +		hit1 = vandq_u16(vmat, mask);
>> +
>>   		/* Compare all signatures in the secondary bucket */
>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
>> -		x = vandq_u16(vmat, mask);
>> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
>> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
>> +		hit2 = vandq_u16(vmat, mask);
>> +
>> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
>> +		hit2 = vorrq_u16(hit1, hit2);
>> +		*hitmask_buffer = vaddvq_u16(hit2);
>> +		}
>> +		break;
>> +#endif
>> +#if defined(RTE_HAS_SVE_ACLE)
>> +	case RTE_HASH_COMPARE_SVE: {
>> +		svuint16_t vsign, shift, sv_matches;
>> +		svbool_t pred, match, bucket_wide_pred;
>> +		int i = 0;
>> +		uint64_t vl = svcnth();
>> +
>> +		vsign = svdup_u16(sig);
>> +		shift = svindex_u16(0, 1);
>> +
>> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
>> +			svuint16_t primary_array_vect, secondary_array_vect;
>> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
>> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
>> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
>> +
>> +			/* We merged the two vectors so we can do both comparison at once */
>> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
>> +				primary_array_vect,
>> +				secondary_array_vect);
>> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
>> +
>> +			/* Compare all signatures in the buckets */
>> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
>> +			if (svptest_any(svptrue_b16(), match)) {
>> +				sv_matches = svdup_u16(1);
>> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
>> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
>> +			}
>> +		} else {
>> +			do {
>> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
>> +				uint16_t lower_half = 0;
>> +				uint16_t upper_half = 0;
>> +				/* Compare all signatures in the primary bucket */
>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>> +							&prim_bucket_sigs[i]));
>> +				if (svptest_any(svptrue_b16(), match)) {
>> +					sv_matches = svdup_u16(1);
>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
>> +				}
>> +				/* Compare all signatures in the secondary bucket */
>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>> +							&sec_bucket_sigs[i]));
>> +				if (svptest_any(svptrue_b16(), match)) {
>> +					sv_matches = svdup_u16(1);
>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
>> +						<< RTE_HASH_BUCKET_ENTRIES;
>> +				}
>> +				hitmask_buffer[i/8] = upper_half | lower_half;
>> +				i += vl;
>> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
>> +		}
>>   		}
>>   		break;
>> +#endif
>>   	default:
>>   		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> -			*prim_hash_matches |=
>> -				((sig == prim_bkt->sig_current[i]) << i);
>> -			*sec_hash_matches |=
>> -				((sig == sec_bkt->sig_current[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == prim_bucket_sigs[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>>   		}
>>   	}
>>   }
>> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
>>
>>   	/* For match mask the first bit of every two bits indicates the match */
>>   	switch (sig_cmp_fn) {
>> -#if defined(__SSE2__)
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>>   	case RTE_HASH_COMPARE_SSE:
>>   		/* Compare all signatures in the bucket */
>>   		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   	uint64_t hits = 0;
>>   	int32_t i;
>>   	int32_t ret;
>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>
>>   #if defined(__ARM_NEON)
>>   	const int hitmask_padding = 0;
>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +
>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>>   #else
>>   	const int hitmask_padding = 1;
>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   #endif
>>
>>   	__hash_rw_reader_lock(h);
>> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   	/* Compare signatures and prefetch key slot of first hit */
>>   	for (i = 0; i < num_keys; i++) {
>>   #if defined(__ARM_NEON)
>> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>> -			primary_bkt[i], secondary_bkt[i],
>> +		uint16_t *hitmask = &hitmask_buffer[i];
>> +		compare_signatures_dense(hitmask,
>> +			primary_bkt[i]->sig_current,
>> +			secondary_bkt[i]->sig_current,
>>   			sig[i], h->sig_cmp_fn);
>> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>   #else
>> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>   			primary_bkt[i], secondary_bkt[i],
>>   			sig[i], h->sig_cmp_fn);
>> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>   #endif
>>
>> -		if (prim_hitmask[i]) {
>> +		if (prim_hitmask) {
>>   			uint32_t first_hit =
>> -					rte_ctz32(prim_hitmask[i])
>> +					rte_ctz32(prim_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				primary_bkt[i]->key_idx[first_hit];
>> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   			continue;
>>   		}
>>
>> -		if (sec_hitmask[i]) {
>> +		if (sec_hitmask) {
>>   			uint32_t first_hit =
>> -					rte_ctz32(sec_hitmask[i])
>> +					rte_ctz32(sec_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				secondary_bkt[i]->key_idx[first_hit];
>> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   	/* Compare keys, first hits in primary first */
>>   	for (i = 0; i < num_keys; i++) {
>>   		positions[i] = -ENOENT;
>> -		while (prim_hitmask[i]) {
>> +#if defined(__ARM_NEON)
>> +		uint16_t *hitmask = &hitmask_buffer[i];
>> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>> +#else
>> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
>> +#endif
>> +		while (prim_hitmask) {
>>   			uint32_t hit_index =
>> -					rte_ctz32(prim_hitmask[i])
>> +					rte_ctz32(prim_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				primary_bkt[i]->key_idx[hit_index];
>> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   				positions[i] = key_idx - 1;
>>   				goto next_key;
>>   			}
>> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   		}
>>
>> -		while (sec_hitmask[i]) {
>> +		while (sec_hitmask) {
>>   			uint32_t hit_index =
>> -					rte_ctz32(sec_hitmask[i])
>> +					rte_ctz32(sec_hitmask)
>>   					>> hitmask_padding;
>>   			uint32_t key_idx =
>>   				secondary_bkt[i]->key_idx[hit_index];
>> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>   				positions[i] = key_idx - 1;
>>   				goto next_key;
>>   			}
>> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   		}
>>   next_key:
>>   		continue;
>> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   	uint64_t hits = 0;
>>   	int32_t i;
>>   	int32_t ret;
>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>   	uint32_t cnt_b, cnt_a;
>>
>>   #if defined(__ARM_NEON)
>>   	const int hitmask_padding = 0;
>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
>>   #else
>>   	const int hitmask_padding = 1;
>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>   #endif
>>
>>   	for (i = 0; i < num_keys; i++)
>> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   		/* Compare signatures and prefetch key slot of first hit */
>>   		for (i = 0; i < num_keys; i++) {
>>   #if defined(__ARM_NEON)
>> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>> -				primary_bkt[i], secondary_bkt[i],
>> +			uint16_t *hitmask = &hitmask_buffer[i];
>> +			compare_signatures_dense(hitmask,
>> +				primary_bkt[i]->sig_current,
>> +				secondary_bkt[i]->sig_current,
>>   				sig[i], h->sig_cmp_fn);
>> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>   #else
>> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>   				primary_bkt[i], secondary_bkt[i],
>>   				sig[i], h->sig_cmp_fn);
>> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>   #endif
>>
>> -			if (prim_hitmask[i]) {
>> +			if (prim_hitmask) {
>>   				uint32_t first_hit =
>> -						rte_ctz32(prim_hitmask[i])
>> +						rte_ctz32(prim_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   					primary_bkt[i]->key_idx[first_hit];
>> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   				continue;
>>   			}
>>
>> -			if (sec_hitmask[i]) {
>> +			if (sec_hitmask) {
>>   				uint32_t first_hit =
>> -						rte_ctz32(sec_hitmask[i])
>> +						rte_ctz32(sec_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   					secondary_bkt[i]->key_idx[first_hit];
>> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>
>>   		/* Compare keys, first hits in primary first */
>>   		for (i = 0; i < num_keys; i++) {
>> -			while (prim_hitmask[i]) {
>> +#if defined(__ARM_NEON)
>> +			uint16_t *hitmask = &hitmask_buffer[i];
>> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>> +#else
>> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
>> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
>> +#endif
>> +			while (prim_hitmask) {
>>   				uint32_t hit_index =
>> -						rte_ctz32(prim_hitmask[i])
>> +						rte_ctz32(prim_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   				rte_atomic_load_explicit(
>> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   					positions[i] = key_idx - 1;
>>   					goto next_key;
>>   				}
>> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   			}
>>
>> -			while (sec_hitmask[i]) {
>> +			while (sec_hitmask) {
>>   				uint32_t hit_index =
>> -						rte_ctz32(sec_hitmask[i])
>> +						rte_ctz32(sec_hitmask)
>>   						>> hitmask_padding;
>>   				uint32_t key_idx =
>>   				rte_atomic_load_explicit(
>> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>   					positions[i] = key_idx - 1;
>>   					goto next_key;
>>   				}
>> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>   			}
>>   next_key:
>>   			continue;
>> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
>> index 8ea793c66e..ed18e1f41e 100644
>> --- a/lib/hash/rte_cuckoo_hash.h
>> +++ b/lib/hash/rte_cuckoo_hash.h
>> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
>>   	RTE_HASH_COMPARE_SCALAR = 0,
>>   	RTE_HASH_COMPARE_SSE,
>>   	RTE_HASH_COMPARE_NEON,
>> +	RTE_HASH_COMPARE_SVE,
>>   	RTE_HASH_COMPARE_NUM
>>   };
>>
>> --
>> 2.34.1
> 
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * RE: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-02-28 14:48       ` Yoan Picchi
@ 2024-03-04 13:35         ` Konstantin Ananyev
  2024-03-05 15:36           ` Yoan Picchi
  0 siblings, 1 reply; 73+ messages in thread
From: Konstantin Ananyev @ 2024-03-04 13:35 UTC (permalink / raw)
  To: Yoan Picchi, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
> >> - Implemented SVE code for comparing signatures in bulk lookup.
> >> - Added Defines in code for SVE code support.
> >> - Optimise NEON code
> >> - New SVE code is ~5% slower than optimized NEON for N2 processor.
> >>
> >> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> >> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> >> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> >> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >> ---
> >>   lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
> >>   lib/hash/rte_cuckoo_hash.h |   1 +
> >>   2 files changed, 151 insertions(+), 46 deletions(-)
> >>
> >> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> >> index a07dd3a28d..231d6d6ded 100644
> >> --- a/lib/hash/rte_cuckoo_hash.c
> >> +++ b/lib/hash/rte_cuckoo_hash.c
> >> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
> >>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
> >>   	else
> >>   #elif defined(RTE_ARCH_ARM64)
> >> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
> >> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
> >>   		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> >> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> >> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> >> +	}
> >>   	else
> >>   #endif
> >>   		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
> >> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
> >>   #if defined(__ARM_NEON)
> >>
> >>   static inline void
> >> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> >> -			const struct rte_hash_bucket *prim_bkt,
> >> -			const struct rte_hash_bucket *sec_bkt,
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >>   			uint16_t sig,
> >>   			enum rte_hash_sig_compare_function sig_cmp_fn)
> >>   {
> >>   	unsigned int i;
> >>
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >>   	/* For match mask every bits indicates the match */
> >>   	switch (sig_cmp_fn) {
> >
> > Can I ask to move arch specific comparison code into some arch-specific headers or so?
> > It is getting really hard to read and understand the generic code with all these ifdefs and arch specific instructions...
> >
Hi, apologies for the long delay in responding.
 
> I can easily enough move the compare_signatures into an arm/x86
> directory, and have a default version in the code.
Yes, that's what I thought about.
 
> The problem would be for bulk lookup. The function is already duplicated
>   2 times (the l and lf version). If I remove the #ifdefs, I'll need to
> duplicate them again into 4 nearly identical versions (dense and
> sparse). The only third options I see would be some preprocessor macro
> to patch the function, but that looks even dirtier to me.
Not sure I understood you here: from looking at the code, I don't see any
arch-specific ifdefs in the bulk_lookup() routines.
What am I missing here?
 
> I think duplicating the code would be bad, but I can do it if you want.
> Unless you have a better solution?
> 
> >> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> >>   	case RTE_HASH_COMPARE_NEON: {
> >> -		uint16x8_t vmat, x;
> >> +		uint16x8_t vmat, hit1, hit2;
> >>   		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
> >>   		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
> >>
> >>   		/* Compare all signatures in the primary bucket */
> >> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
> >> -		x = vandq_u16(vmat, mask);
> >> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
> >> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
> >> +		hit1 = vandq_u16(vmat, mask);
> >> +
> >>   		/* Compare all signatures in the secondary bucket */
> >> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
> >> -		x = vandq_u16(vmat, mask);
> >> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
> >> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
> >> +		hit2 = vandq_u16(vmat, mask);
> >> +
> >> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
> >> +		hit2 = vorrq_u16(hit1, hit2);
> >> +		*hitmask_buffer = vaddvq_u16(hit2);
> >> +		}
> >> +		break;
> >> +#endif
> >> +#if defined(RTE_HAS_SVE_ACLE)
> >> +	case RTE_HASH_COMPARE_SVE: {
> >> +		svuint16_t vsign, shift, sv_matches;
> >> +		svbool_t pred, match, bucket_wide_pred;
> >> +		int i = 0;
> >> +		uint64_t vl = svcnth();
> >> +
> >> +		vsign = svdup_u16(sig);
> >> +		shift = svindex_u16(0, 1);
> >> +
> >> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
> >> +			svuint16_t primary_array_vect, secondary_array_vect;
> >> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
> >> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
> >> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
> >> +
> >> +			/* We merged the two vectors so we can do both comparison at once */
> >> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
> >> +				primary_array_vect,
> >> +				secondary_array_vect);
> >> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
> >> +
> >> +			/* Compare all signatures in the buckets */
> >> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
> >> +			if (svptest_any(svptrue_b16(), match)) {
> >> +				sv_matches = svdup_u16(1);
> >> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
> >> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
> >> +			}
> >> +		} else {
> >> +			do {
> >> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
> >> +				uint16_t lower_half = 0;
> >> +				uint16_t upper_half = 0;
> >> +				/* Compare all signatures in the primary bucket */
> >> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> >> +							&prim_bucket_sigs[i]));
> >> +				if (svptest_any(svptrue_b16(), match)) {
> >> +					sv_matches = svdup_u16(1);
> >> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> >> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
> >> +				}
> >> +				/* Compare all signatures in the secondary bucket */
> >> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> >> +							&sec_bucket_sigs[i]));
> >> +				if (svptest_any(svptrue_b16(), match)) {
> >> +					sv_matches = svdup_u16(1);
> >> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
> >> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
> >> +						<< RTE_HASH_BUCKET_ENTRIES;
> >> +				}
> >> +				hitmask_buffer[i/8] = upper_half | lower_half;
> >> +				i += vl;
> >> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
> >> +		}
> >>   		}
> >>   		break;
> >> +#endif
> >>   	default:
> >>   		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> -			*prim_hash_matches |=
> >> -				((sig == prim_bkt->sig_current[i]) << i);
> >> -			*sec_hash_matches |=
> >> -				((sig == sec_bkt->sig_current[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == prim_bucket_sigs[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >>   		}
> >>   	}
> >>   }
> >> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
> >>
> >>   	/* For match mask the first bit of every two bits indicates the match */
> >>   	switch (sig_cmp_fn) {
> >> -#if defined(__SSE2__)
> >> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> >>   	case RTE_HASH_COMPARE_SSE:
> >>   		/* Compare all signatures in the bucket */
> >>   		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   	uint64_t hits = 0;
> >>   	int32_t i;
> >>   	int32_t ret;
> >> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
> >>
> >>   #if defined(__ARM_NEON)
> >>   	const int hitmask_padding = 0;
> >> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
> >>   #else
> >>   	const int hitmask_padding = 1;
> >> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   #endif
> >>
> >>   	__hash_rw_reader_lock(h);
> >> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   	/* Compare signatures and prefetch key slot of first hit */
> >>   	for (i = 0; i < num_keys; i++) {
> >>   #if defined(__ARM_NEON)
> >> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> >> -			primary_bkt[i], secondary_bkt[i],
> >> +		uint16_t *hitmask = &hitmask_buffer[i];
> >> +		compare_signatures_dense(hitmask,
> >> +			primary_bkt[i]->sig_current,
> >> +			secondary_bkt[i]->sig_current,
> >>   			sig[i], h->sig_cmp_fn);
> >> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >>   #else
> >> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> >> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
> >>   			primary_bkt[i], secondary_bkt[i],
> >>   			sig[i], h->sig_cmp_fn);
> >> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >>   #endif
> >>
> >> -		if (prim_hitmask[i]) {
> >> +		if (prim_hitmask) {
> >>   			uint32_t first_hit =
> >> -					rte_ctz32(prim_hitmask[i])
> >> +					rte_ctz32(prim_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				primary_bkt[i]->key_idx[first_hit];
> >> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   			continue;
> >>   		}
> >>
> >> -		if (sec_hitmask[i]) {
> >> +		if (sec_hitmask) {
> >>   			uint32_t first_hit =
> >> -					rte_ctz32(sec_hitmask[i])
> >> +					rte_ctz32(sec_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				secondary_bkt[i]->key_idx[first_hit];
> >> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   	/* Compare keys, first hits in primary first */
> >>   	for (i = 0; i < num_keys; i++) {
> >>   		positions[i] = -ENOENT;
> >> -		while (prim_hitmask[i]) {
> >> +#if defined(__ARM_NEON)
> >> +		uint16_t *hitmask = &hitmask_buffer[i];
> >> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >> +#else
> >> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >> +#endif
> >> +		while (prim_hitmask) {
> >>   			uint32_t hit_index =
> >> -					rte_ctz32(prim_hitmask[i])
> >> +					rte_ctz32(prim_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				primary_bkt[i]->key_idx[hit_index];
> >> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   				positions[i] = key_idx - 1;
> >>   				goto next_key;
> >>   			}
> >> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   		}
> >>
> >> -		while (sec_hitmask[i]) {
> >> +		while (sec_hitmask) {
> >>   			uint32_t hit_index =
> >> -					rte_ctz32(sec_hitmask[i])
> >> +					rte_ctz32(sec_hitmask)
> >>   					>> hitmask_padding;
> >>   			uint32_t key_idx =
> >>   				secondary_bkt[i]->key_idx[hit_index];
> >> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
> >>   				positions[i] = key_idx - 1;
> >>   				goto next_key;
> >>   			}
> >> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   		}
> >>   next_key:
> >>   		continue;
> >> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   	uint64_t hits = 0;
> >>   	int32_t i;
> >>   	int32_t ret;
> >> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   	struct rte_hash_bucket *cur_bkt, *next_bkt;
> >>   	uint32_t cnt_b, cnt_a;
> >>
> >>   #if defined(__ARM_NEON)
> >>   	const int hitmask_padding = 0;
> >> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
> >>   #else
> >>   	const int hitmask_padding = 1;
> >> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> >>   #endif
> >>
> >>   	for (i = 0; i < num_keys; i++)
> >> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   		/* Compare signatures and prefetch key slot of first hit */
> >>   		for (i = 0; i < num_keys; i++) {
> >>   #if defined(__ARM_NEON)
> >> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
> >> -				primary_bkt[i], secondary_bkt[i],
> >> +			uint16_t *hitmask = &hitmask_buffer[i];
> >> +			compare_signatures_dense(hitmask,
> >> +				primary_bkt[i]->sig_current,
> >> +				secondary_bkt[i]->sig_current,
> >>   				sig[i], h->sig_cmp_fn);
> >> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >>   #else
> >> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
> >> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
> >>   				primary_bkt[i], secondary_bkt[i],
> >>   				sig[i], h->sig_cmp_fn);
> >> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >>   #endif
> >>
> >> -			if (prim_hitmask[i]) {
> >> +			if (prim_hitmask) {
> >>   				uint32_t first_hit =
> >> -						rte_ctz32(prim_hitmask[i])
> >> +						rte_ctz32(prim_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   					primary_bkt[i]->key_idx[first_hit];
> >> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   				continue;
> >>   			}
> >>
> >> -			if (sec_hitmask[i]) {
> >> +			if (sec_hitmask) {
> >>   				uint32_t first_hit =
> >> -						rte_ctz32(sec_hitmask[i])
> >> +						rte_ctz32(sec_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   					secondary_bkt[i]->key_idx[first_hit];
> >> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>
> >>   		/* Compare keys, first hits in primary first */
> >>   		for (i = 0; i < num_keys; i++) {
> >> -			while (prim_hitmask[i]) {
> >> +#if defined(__ARM_NEON)
> >> +			uint16_t *hitmask = &hitmask_buffer[i];
> >> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
> >> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
> >> +#else
> >> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
> >> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
> >> +#endif
> >> +			while (prim_hitmask) {
> >>   				uint32_t hit_index =
> >> -						rte_ctz32(prim_hitmask[i])
> >> +						rte_ctz32(prim_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   				rte_atomic_load_explicit(
> >> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   					positions[i] = key_idx - 1;
> >>   					goto next_key;
> >>   				}
> >> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   			}
> >>
> >> -			while (sec_hitmask[i]) {
> >> +			while (sec_hitmask) {
> >>   				uint32_t hit_index =
> >> -						rte_ctz32(sec_hitmask[i])
> >> +						rte_ctz32(sec_hitmask)
> >>   						>> hitmask_padding;
> >>   				uint32_t key_idx =
> >>   				rte_atomic_load_explicit(
> >> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
> >>   					positions[i] = key_idx - 1;
> >>   					goto next_key;
> >>   				}
> >> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
> >> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
> >>   			}
> >>   next_key:
> >>   			continue;
> >> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
> >> index 8ea793c66e..ed18e1f41e 100644
> >> --- a/lib/hash/rte_cuckoo_hash.h
> >> +++ b/lib/hash/rte_cuckoo_hash.h
> >> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
> >>   	RTE_HASH_COMPARE_SCALAR = 0,
> >>   	RTE_HASH_COMPARE_SSE,
> >>   	RTE_HASH_COMPARE_NEON,
> >> +	RTE_HASH_COMPARE_SVE,
> >>   	RTE_HASH_COMPARE_NUM
> >>   };
> >>
> >> --
> >> 2.34.1
> >
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v5 4/4] hash: add SVE support for bulk key lookup
  2024-03-04 13:35         ` Konstantin Ananyev
@ 2024-03-05 15:36           ` Yoan Picchi
  0 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-05 15:36 UTC (permalink / raw)
  To: Konstantin Ananyev, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
On 3/4/24 13:35, Konstantin Ananyev wrote:
> 
> 
>>>> - Implemented SVE code for comparing signatures in bulk lookup.
>>>> - Added Defines in code for SVE code support.
>>>> - Optimise NEON code
>>>> - New SVE code is ~5% slower than optimized NEON for N2 processor.
>>>>
>>>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>>>> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
>>>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>>>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>>>> ---
>>>>    lib/hash/rte_cuckoo_hash.c | 196 ++++++++++++++++++++++++++++---------
>>>>    lib/hash/rte_cuckoo_hash.h |   1 +
>>>>    2 files changed, 151 insertions(+), 46 deletions(-)
>>>>
>>>> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
>>>> index a07dd3a28d..231d6d6ded 100644
>>>> --- a/lib/hash/rte_cuckoo_hash.c
>>>> +++ b/lib/hash/rte_cuckoo_hash.c
>>>> @@ -442,8 +442,11 @@ rte_hash_create(const struct rte_hash_parameters *params)
>>>>    		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>>>>    	else
>>>>    #elif defined(RTE_ARCH_ARM64)
>>>> -	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
>>>> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>>>>    		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
>>>> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
>>>> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
>>>> +	}
>>>>    	else
>>>>    #endif
>>>>    		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
>>>> @@ -1860,37 +1863,103 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
>>>>    #if defined(__ARM_NEON)
>>>>
>>>>    static inline void
>>>> -compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>>>> -			const struct rte_hash_bucket *prim_bkt,
>>>> -			const struct rte_hash_bucket *sec_bkt,
>>>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>>>> +			const uint16_t *prim_bucket_sigs,
>>>> +			const uint16_t *sec_bucket_sigs,
>>>>    			uint16_t sig,
>>>>    			enum rte_hash_sig_compare_function sig_cmp_fn)
>>>>    {
>>>>    	unsigned int i;
>>>>
>>>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>>>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>>>> +
>>>>    	/* For match mask every bits indicates the match */
>>>>    	switch (sig_cmp_fn) {
>>>
>>> Can I ask to move arch specific comparison code into some arch-specific headers or so?
>>> It is getting really hard to read and understand the generic code with all these ifdefs and arch specific instructions...
>>>
> 
> Hi, apologies for long delay in response.
> 
>   
>> I can easily enough move the compare_signatures into an arm/x86
>> directory, and have a default version in the code.
> 
> Yes, that's what I thought about.
>   
>> The problem would be for bulk lookup. The function is already duplicated
>>    2 times (the l and lf version). If I remove the #ifdefs, I'll need to
>> duplicate them again into 4 nearly identical versions (dense and
>> sparse). The only third options I see would be some preprocessor macro
>> to patch the function, but that looks even dirtier to me.
> 
> Not sure I understood you here: from looking at the code I don't see any
> arch specific ifdefs in bulk_lookup() routines.
> What I am missing here?
>   
Most if not all of those #if are architecture specific. For instance:
#if defined(__ARM_NEON)
#if defined(RTE_HAS_SVE_ACLE)
The main reason there are some #ifs in bulk lookup is to handle whether the
function runs with a dense hitmask or a sparse hitmask.
x86 only supports the sparse hitmask version (1 bit of data, 1 bit of padding),
but Arm supports the dense hitmask (every bit counts). The latter ends up
being faster.
Splitting bulk_lookup into its sparse and dense variants would be a lot
of code duplication that I'd prefer to avoid.
What I might be able to do is move compare_signatures into some
arch-specific version. The functions are different enough that it
wouldn't be too much code duplication. I'd argue, though, that the
#ifdefs for NEON and SSE were already there and I only added the SVE variant.
> 
>> I think duplicating the code would be bad, but I can do it if you want.
>> Unless you have a better solution?
>>
>>>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>>>>    	case RTE_HASH_COMPARE_NEON: {
>>>> -		uint16x8_t vmat, x;
>>>> +		uint16x8_t vmat, hit1, hit2;
>>>>    		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
>>>>    		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>>>
>>>>    		/* Compare all signatures in the primary bucket */
>>>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bkt->sig_current));
>>>> -		x = vandq_u16(vmat, mask);
>>>> -		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
>>>> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
>>>> +		hit1 = vandq_u16(vmat, mask);
>>>> +
>>>>    		/* Compare all signatures in the secondary bucket */
>>>> -		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bkt->sig_current));
>>>> -		x = vandq_u16(vmat, mask);
>>>> -		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
>>>> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
>>>> +		hit2 = vandq_u16(vmat, mask);
>>>> +
>>>> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
>>>> +		hit2 = vorrq_u16(hit1, hit2);
>>>> +		*hitmask_buffer = vaddvq_u16(hit2);
>>>> +		}
>>>> +		break;
>>>> +#endif
>>>> +#if defined(RTE_HAS_SVE_ACLE)
>>>> +	case RTE_HASH_COMPARE_SVE: {
>>>> +		svuint16_t vsign, shift, sv_matches;
>>>> +		svbool_t pred, match, bucket_wide_pred;
>>>> +		int i = 0;
>>>> +		uint64_t vl = svcnth();
>>>> +
>>>> +		vsign = svdup_u16(sig);
>>>> +		shift = svindex_u16(0, 1);
>>>> +
>>>> +		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
>>>> +			svuint16_t primary_array_vect, secondary_array_vect;
>>>> +			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
>>>> +			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
>>>> +			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
>>>> +
>>>> +			/* We merged the two vectors so we can do both comparison at once */
>>>> +			primary_array_vect = svsplice_u16(bucket_wide_pred,
>>>> +				primary_array_vect,
>>>> +				secondary_array_vect);
>>>> +			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
>>>> +
>>>> +			/* Compare all signatures in the buckets */
>>>> +			match = svcmpeq_u16(pred, vsign, primary_array_vect);
>>>> +			if (svptest_any(svptrue_b16(), match)) {
>>>> +				sv_matches = svdup_u16(1);
>>>> +				sv_matches = svlsl_u16_z(match, sv_matches, shift);
>>>> +				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
>>>> +			}
>>>> +		} else {
>>>> +			do {
>>>> +				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
>>>> +				uint16_t lower_half = 0;
>>>> +				uint16_t upper_half = 0;
>>>> +				/* Compare all signatures in the primary bucket */
>>>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>>>> +							&prim_bucket_sigs[i]));
>>>> +				if (svptest_any(svptrue_b16(), match)) {
>>>> +					sv_matches = svdup_u16(1);
>>>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>>>> +					lower_half = svorv_u16(svptrue_b16(), sv_matches);
>>>> +				}
>>>> +				/* Compare all signatures in the secondary bucket */
>>>> +				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
>>>> +							&sec_bucket_sigs[i]));
>>>> +				if (svptest_any(svptrue_b16(), match)) {
>>>> +					sv_matches = svdup_u16(1);
>>>> +					sv_matches = svlsl_u16_z(match, sv_matches, shift);
>>>> +					upper_half = svorv_u16(svptrue_b16(), sv_matches)
>>>> +						<< RTE_HASH_BUCKET_ENTRIES;
>>>> +				}
>>>> +				hitmask_buffer[i/8] = upper_half | lower_half;
>>>> +				i += vl;
>>>> +			} while (i < RTE_HASH_BUCKET_ENTRIES);
>>>> +		}
>>>>    		}
>>>>    		break;
>>>> +#endif
>>>>    	default:
>>>>    		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>>>> -			*prim_hash_matches |=
>>>> -				((sig == prim_bkt->sig_current[i]) << i);
>>>> -			*sec_hash_matches |=
>>>> -				((sig == sec_bkt->sig_current[i]) << i);
>>>> +			*hitmask_buffer |=
>>>> +				((sig == prim_bucket_sigs[i]) << i);
>>>> +			*hitmask_buffer |=
>>>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>>>>    		}
>>>>    	}
>>>>    }
>>>> @@ -1908,7 +1977,7 @@ compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matche
>>>>
>>>>    	/* For match mask the first bit of every two bits indicates the match */
>>>>    	switch (sig_cmp_fn) {
>>>> -#if defined(__SSE2__)
>>>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>>>>    	case RTE_HASH_COMPARE_SSE:
>>>>    		/* Compare all signatures in the bucket */
>>>>    		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>>>> @@ -1948,14 +2017,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    	uint64_t hits = 0;
>>>>    	int32_t i;
>>>>    	int32_t ret;
>>>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>>>
>>>>    #if defined(__ARM_NEON)
>>>>    	const int hitmask_padding = 0;
>>>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +
>>>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>>>> +	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
>>>>    #else
>>>>    	const int hitmask_padding = 1;
>>>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    #endif
>>>>
>>>>    	__hash_rw_reader_lock(h);
>>>> @@ -1963,18 +2036,24 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    	/* Compare signatures and prefetch key slot of first hit */
>>>>    	for (i = 0; i < num_keys; i++) {
>>>>    #if defined(__ARM_NEON)
>>>> -		compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>>>> -			primary_bkt[i], secondary_bkt[i],
>>>> +		uint16_t *hitmask = &hitmask_buffer[i];
>>>> +		compare_signatures_dense(hitmask,
>>>> +			primary_bkt[i]->sig_current,
>>>> +			secondary_bkt[i]->sig_current,
>>>>    			sig[i], h->sig_cmp_fn);
>>>> +		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>>    #else
>>>> -		compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>>>> +		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>>>    			primary_bkt[i], secondary_bkt[i],
>>>>    			sig[i], h->sig_cmp_fn);
>>>> +		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>>    #endif
>>>>
>>>> -		if (prim_hitmask[i]) {
>>>> +		if (prim_hitmask) {
>>>>    			uint32_t first_hit =
>>>> -					rte_ctz32(prim_hitmask[i])
>>>> +					rte_ctz32(prim_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				primary_bkt[i]->key_idx[first_hit];
>>>> @@ -1986,9 +2065,9 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    			continue;
>>>>    		}
>>>>
>>>> -		if (sec_hitmask[i]) {
>>>> +		if (sec_hitmask) {
>>>>    			uint32_t first_hit =
>>>> -					rte_ctz32(sec_hitmask[i])
>>>> +					rte_ctz32(sec_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				secondary_bkt[i]->key_idx[first_hit];
>>>> @@ -2003,9 +2082,17 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    	/* Compare keys, first hits in primary first */
>>>>    	for (i = 0; i < num_keys; i++) {
>>>>    		positions[i] = -ENOENT;
>>>> -		while (prim_hitmask[i]) {
>>>> +#if defined(__ARM_NEON)
>>>> +		uint16_t *hitmask = &hitmask_buffer[i];
>>>> +		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>> +#else
>>>> +		unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +		unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>> +#endif
>>>> +		while (prim_hitmask) {
>>>>    			uint32_t hit_index =
>>>> -					rte_ctz32(prim_hitmask[i])
>>>> +					rte_ctz32(prim_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				primary_bkt[i]->key_idx[hit_index];
>>>> @@ -2028,12 +2115,12 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    				positions[i] = key_idx - 1;
>>>>    				goto next_key;
>>>>    			}
>>>> -			prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    		}
>>>>
>>>> -		while (sec_hitmask[i]) {
>>>> +		while (sec_hitmask) {
>>>>    			uint32_t hit_index =
>>>> -					rte_ctz32(sec_hitmask[i])
>>>> +					rte_ctz32(sec_hitmask)
>>>>    					>> hitmask_padding;
>>>>    			uint32_t key_idx =
>>>>    				secondary_bkt[i]->key_idx[hit_index];
>>>> @@ -2057,7 +2144,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
>>>>    				positions[i] = key_idx - 1;
>>>>    				goto next_key;
>>>>    			}
>>>> -			sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    		}
>>>>    next_key:
>>>>    		continue;
>>>> @@ -2107,15 +2194,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    	uint64_t hits = 0;
>>>>    	int32_t i;
>>>>    	int32_t ret;
>>>> -	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> -	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    	struct rte_hash_bucket *cur_bkt, *next_bkt;
>>>>    	uint32_t cnt_b, cnt_a;
>>>>
>>>>    #if defined(__ARM_NEON)
>>>>    	const int hitmask_padding = 0;
>>>> +	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
>>>> +	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
>>>>    #else
>>>>    	const int hitmask_padding = 1;
>>>> +	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>> +	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>>>>    #endif
>>>>
>>>>    	for (i = 0; i < num_keys; i++)
>>>> @@ -2132,18 +2222,24 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    		/* Compare signatures and prefetch key slot of first hit */
>>>>    		for (i = 0; i < num_keys; i++) {
>>>>    #if defined(__ARM_NEON)
>>>> -			compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i],
>>>> -				primary_bkt[i], secondary_bkt[i],
>>>> +			uint16_t *hitmask = &hitmask_buffer[i];
>>>> +			compare_signatures_dense(hitmask,
>>>> +				primary_bkt[i]->sig_current,
>>>> +				secondary_bkt[i]->sig_current,
>>>>    				sig[i], h->sig_cmp_fn);
>>>> +			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>>    #else
>>>> -			compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i],
>>>> +			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
>>>>    				primary_bkt[i], secondary_bkt[i],
>>>>    				sig[i], h->sig_cmp_fn);
>>>> +			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>>    #endif
>>>>
>>>> -			if (prim_hitmask[i]) {
>>>> +			if (prim_hitmask) {
>>>>    				uint32_t first_hit =
>>>> -						rte_ctz32(prim_hitmask[i])
>>>> +						rte_ctz32(prim_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    					primary_bkt[i]->key_idx[first_hit];
>>>> @@ -2155,9 +2251,9 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    				continue;
>>>>    			}
>>>>
>>>> -			if (sec_hitmask[i]) {
>>>> +			if (sec_hitmask) {
>>>>    				uint32_t first_hit =
>>>> -						rte_ctz32(sec_hitmask[i])
>>>> +						rte_ctz32(sec_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    					secondary_bkt[i]->key_idx[first_hit];
>>>> @@ -2171,9 +2267,17 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>
>>>>    		/* Compare keys, first hits in primary first */
>>>>    		for (i = 0; i < num_keys; i++) {
>>>> -			while (prim_hitmask[i]) {
>>>> +#if defined(__ARM_NEON)
>>>> +			uint16_t *hitmask = &hitmask_buffer[i];
>>>> +			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
>>>> +			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
>>>> +#else
>>>> +			unsigned int prim_hitmask = prim_hitmask_buffer[i];
>>>> +			unsigned int sec_hitmask = sec_hitmask_buffer[i];
>>>> +#endif
>>>> +			while (prim_hitmask) {
>>>>    				uint32_t hit_index =
>>>> -						rte_ctz32(prim_hitmask[i])
>>>> +						rte_ctz32(prim_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    				rte_atomic_load_explicit(
>>>> @@ -2200,12 +2304,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    					positions[i] = key_idx - 1;
>>>>    					goto next_key;
>>>>    				}
>>>> -				prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    			}
>>>>
>>>> -			while (sec_hitmask[i]) {
>>>> +			while (sec_hitmask) {
>>>>    				uint32_t hit_index =
>>>> -						rte_ctz32(sec_hitmask[i])
>>>> +						rte_ctz32(sec_hitmask)
>>>>    						>> hitmask_padding;
>>>>    				uint32_t key_idx =
>>>>    				rte_atomic_load_explicit(
>>>> @@ -2233,7 +2337,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
>>>>    					positions[i] = key_idx - 1;
>>>>    					goto next_key;
>>>>    				}
>>>> -				sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding));
>>>> +				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
>>>>    			}
>>>>    next_key:
>>>>    			continue;
>>>> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
>>>> index 8ea793c66e..ed18e1f41e 100644
>>>> --- a/lib/hash/rte_cuckoo_hash.h
>>>> +++ b/lib/hash/rte_cuckoo_hash.h
>>>> @@ -137,6 +137,7 @@ enum rte_hash_sig_compare_function {
>>>>    	RTE_HASH_COMPARE_SCALAR = 0,
>>>>    	RTE_HASH_COMPARE_SSE,
>>>>    	RTE_HASH_COMPARE_NEON,
>>>> +	RTE_HASH_COMPARE_SVE,
>>>>    	RTE_HASH_COMPARE_NUM
>>>>    };
>>>>
>>>> --
>>>> 2.34.1
>>>
> 
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
 
 
 
 
- * [PATCH v6 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (4 preceding siblings ...)
  2024-02-27 17:41 ` [PATCH v5 0/4] " Yoan Picchi
@ 2024-03-11 23:21 ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
                   ` (5 subsequent siblings)
  11 siblings, 4 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 197 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 392 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 195 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 6 files changed, 258 insertions(+), 92 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..1af6ba8190
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..dcf9444032
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..e41f03270a 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -442,8 +450,9 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
@@ -1857,63 +1866,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1876,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1924,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1941,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1974,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2003,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2053,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2080,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2110,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2126,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2163,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2196,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v6 2/4] hash: optimize compare signature for NEON
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking with specific masks.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 1af6ba8190..b5a457f936 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(¶ms_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v6 4/4] hash: add SVE support for bulk key lookup
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-03-11 23:21   ` [PATCH v6 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-03-11 23:21   ` Yoan Picchi
  2024-03-12  3:57     ` fengchengwen
  3 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-03-11 23:21 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- Added Defines in code for SVE code support.
- Optimise NEON code
- New SVE code is ~5% slower than optimized NEON for N2 processor.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  2 +
 2 files changed, 60 insertions(+)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index b5a457f936..8a0627e119 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index e41f03270a..7a474267f0 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -452,6 +452,8 @@ rte_hash_create(const struct rte_hash_parameters *params)
 #elif defined(RTE_ARCH_ARM64)
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
 	}
 	else
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v6 4/4] hash: add SVE support for bulk key lookup
  2024-03-11 23:21   ` [PATCH v6 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-03-12  3:57     ` fengchengwen
  2024-03-12 15:08       ` Yoan Picchi
  0 siblings, 1 reply; 73+ messages in thread
From: fengchengwen @ 2024-03-12  3:57 UTC (permalink / raw)
  To: Yoan Picchi, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
Hi Yoan,
On 2024/3/12 7:21, Yoan Picchi wrote:
> - Implemented SVE code for comparing signatures in bulk lookup.
> - Added Defines in code for SVE code support.
> - Optimise NEON code
This commit does not include this part. Pls only describe the content in this commit.
> - New SVE code is ~5% slower than optimized NEON for N2 processor.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
>  lib/hash/rte_cuckoo_hash.c             |  2 +
>  2 files changed, 60 insertions(+)
> 
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> index b5a457f936..8a0627e119 100644
> --- a/lib/hash/arch/arm/compare_signatures.h
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>  		*hitmask_buffer = vaddvq_u16(hit2);
>  		}
>  		break;
> +#endif
> +#if defined(RTE_HAS_SVE_ACLE)
> +	case RTE_HASH_COMPARE_SVE: {
...
>  #endif
>  	default:
>  		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> index e41f03270a..7a474267f0 100644
> --- a/lib/hash/rte_cuckoo_hash.c
> +++ b/lib/hash/rte_cuckoo_hash.c
> @@ -452,6 +452,8 @@ rte_hash_create(const struct rte_hash_parameters *params)
>  #elif defined(RTE_ARCH_ARM64)
>  	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>  		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
The RTE_HASH_COMPARE_SVE was defined in "[PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup",
but its first use is in this commit, so I think it should be defined in this commit.
If RTE_CPUFLAG_SVE and RTE_HAS_SVE_ACLE are both set, then the SVE impl will be chosen.
If RTE_CPUFLAG_SVE is defined but RTE_HAS_SVE_ACLE is not, then scalar will be chosen --- in this case we could fall back to the NEON impl.
So I suggest directly using "#if defined(RTE_HAS_SVE_ACLE)" here.
>  	}
>  	else
>  #endif
> 
Plus:
I notice the commit log says the SVE performance is slower than NEON.
I also notice that SVE is slower than NEON on other platforms:
1. b4ee9c07bd config/arm: disable SVE ACLE for CN10K
2. 4eea7c6461 config/arm: add SVE ACLE control flag
So maybe we should disable RTE_HAS_SVE_ACLE default by:
diff --git a/config/arm/meson.build b/config/arm/meson.build
index 9d6fb87d7f..a5b890d100 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -875,7 +875,7 @@ endif
 if cc.get_define('__ARM_FEATURE_SVE', args: machine_args) != ''
     compile_time_cpuflags += ['RTE_CPUFLAG_SVE']
-    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', true))
+    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', false))
         dpdk_conf.set('RTE_HAS_SVE_ACLE', 1)
     endif
 endif
If a platform verifies that SVE has higher performance, it could enable SVE by adding "sve_acle: true" in its soc_xxx config.
Thanks
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v6 4/4] hash: add SVE support for bulk key lookup
  2024-03-12  3:57     ` fengchengwen
@ 2024-03-12 15:08       ` Yoan Picchi
  0 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:08 UTC (permalink / raw)
  To: fengchengwen, Yoan Picchi, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
On 3/12/24 03:57, fengchengwen wrote:
> Hi Yoan,
> 
> On 2024/3/12 7:21, Yoan Picchi wrote:
>> - Implemented SVE code for comparing signatures in bulk lookup.
>> - Added Defines in code for SVE code support.
>> - Optimise NEON code
> 
> This commit does not include this part. Pls only describe the content in this commit.
Thank you. I forgot to edit that out after moving commits around.
> 
>> - New SVE code is ~5% slower than optimized NEON for N2 processor.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>>   lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
>>   lib/hash/rte_cuckoo_hash.c             |  2 +
>>   2 files changed, 60 insertions(+)
>>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> index b5a457f936..8a0627e119 100644
>> --- a/lib/hash/arch/arm/compare_signatures.h
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>>   		*hitmask_buffer = vaddvq_u16(hit2);
>>   		}
>>   		break;
>> +#endif
>> +#if defined(RTE_HAS_SVE_ACLE)
>> +	case RTE_HASH_COMPARE_SVE: {
> 
> ...
> 
>>   #endif
>>   	default:
>>   		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
>> index e41f03270a..7a474267f0 100644
>> --- a/lib/hash/rte_cuckoo_hash.c
>> +++ b/lib/hash/rte_cuckoo_hash.c
>> @@ -452,6 +452,8 @@ rte_hash_create(const struct rte_hash_parameters *params)
>>   #elif defined(RTE_ARCH_ARM64)
>>   	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>>   		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
>> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
>> +			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> 
> The RTE_HASH_COMPARE_SVE was defined in "PATCH v6 1/4] hash: pack the hitmask for hash in bulk lookup",
> but its first use is in this commit, so I think it should defined in this commit.
> 
> If RTE_CPUFLAG_SVE and RTE_HAS_SVE_ACLE both set, then SVE impl will be chosen.
> If RTE_CPUFLAG_SVE defined, but RTE_HAS_SVE_ACLE was not, then scalar will be chosen. --- in this case we could back to NEON impl.
> So I suggest direct use "#if defined(RTE_HAS_SVE_ACLE)" here.
Sounds fair. I'll do it.
> 
>>   	}
>>   	else
>>   #endif
>>
> 
> Plus:
> I notice the commit log said the SVE performance is slower than NEON.
> 
> And I also notice other platform SVE also lower than NEON,
> 1. b4ee9c07bd config/arm: disable SVE ACLE for CN10K
> 2. 4eea7c6461 config/arm: add SVE ACLE control flag
> 
> So maybe we should disable RTE_HAS_SVE_ACLE default by:
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 9d6fb87d7f..a5b890d100 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -875,7 +875,7 @@ endif
> 
>   if cc.get_define('__ARM_FEATURE_SVE', args: machine_args) != ''
>       compile_time_cpuflags += ['RTE_CPUFLAG_SVE']
> -    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', true))
> +    if (cc.check_header('arm_sve.h') and soc_config.get('sve_acle', false))
>           dpdk_conf.set('RTE_HAS_SVE_ACLE', 1)
>       endif
>   endif
> 
> If the platform verify SVE has higher performance, then it could enable SVE by add "sve_acle: true" in soc_xxx config.
> 
> Thanks
Here I kinda disagree. In this particular instance, SVE is a bit slower 
with narrow vectors (128b), but could be faster with some wider vector 
sizes.
Even in general SVE 128b is not just slower than neon. It's a case by
case basis. Sometimes it's slower, sometimes it's faster, so I don't think
we should just disable it by default. In any case, disabling it should
be its own patch with much discussion, not just an offhand thing we
include in the middle of this patch.
This SVE version is still faster than the upstream neon version. I just 
happen to have improved the neon version even more.
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
 
 
- * [PATCH v7 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (5 preceding siblings ...)
  2024-03-11 23:21 ` [PATCH v6 0/4] " Yoan Picchi
@ 2024-03-12 15:42 ` Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
                   ` (4 subsequent siblings)
  11 siblings, 4 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 394 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  2024-03-19 10:41     ` Konstantin Ananyev
  2024-03-19 16:09     ` Stephen Hemminger
  2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 255 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..1af6ba8190
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..dcf9444032
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * RE: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-03-19 10:41     ` Konstantin Ananyev
  2024-03-19 13:09       ` Yoan Picchi
  2024-03-19 16:09     ` Stephen Hemminger
  1 sibling, 1 reply; 73+ messages in thread
From: Konstantin Ananyev @ 2024-03-19 10:41 UTC (permalink / raw)
  To: Yoan Picchi, Thomas Monjalon, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown
Hi,
> Current hitmask includes padding due to Intel's SIMD
> implementation detail. This patch allows non Intel SIMD
> implementations to benefit from a dense hitmask.
> In addition, the new dense hitmask interweave the primary
> and secondary matches which allow a better cache usage and
> enable future improvements for the SIMD implementations
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> ---
>  .mailmap                                  |   2 +
>  lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>  lib/hash/arch/common/compare_signatures.h |  38 +++++
>  lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>  lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>  5 files changed, 255 insertions(+), 91 deletions(-)
>  create mode 100644 lib/hash/arch/arm/compare_signatures.h
>  create mode 100644 lib/hash/arch/common/compare_signatures.h
>  create mode 100644 lib/hash/arch/x86/compare_signatures.h
> 
> diff --git a/.mailmap b/.mailmap
> index 66ebc20666..00b50414d3 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>  Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>  Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>  Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> +Harjot Singh <harjot.singh@arm.com>
>  Harman Kalra <hkalra@marvell.com>
>  Harneet Singh <harneet.singh@intel.com>
>  Harold Huang <baymaxhuang@gmail.com>
> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>  Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>  Yi Zhang <zhang.yi75@zte.com.cn>
>  Yoann Desmouceaux <ydesmouc@cisco.com>
> +Yoan Picchi <yoan.picchi@arm.com>
>  Yogesh Jangra <yogesh.jangra@intel.com>
>  Yogev Chaimovich <yogev@cgstowernetworks.com>
>  Yongjie Gu <yongjiex.gu@intel.com>
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> new file mode 100644
> index 0000000000..1af6ba8190
> --- /dev/null
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -0,0 +1,61 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * Arm's version uses a densely packed hitmask buffer:
> + * Every bit is in use.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
> +	/* For match mask every bits indicates the match */
> +	switch (sig_cmp_fn) {
> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> +	case RTE_HASH_COMPARE_NEON: {
> +		uint16x8_t vmat, vsig, x;
> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> +		uint16_t low, high;
> +
> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
> +		/* Compare all signatures in the primary bucket */
> +		vmat = vceqq_u16(vsig,
> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +		low = (uint16_t)(vaddvq_u16(x));
> +		/* Compare all signatures in the secondary bucket */
> +		vmat = vceqq_u16(vsig,
> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +		high = (uint16_t)(vaddvq_u16(x));
> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +
> +		}
> +		break;
> +#endif
> +	default:
> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			*hitmask_buffer |=
> +				((sig == prim_bucket_sigs[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +		}
> +	}
> +}
> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> new file mode 100644
> index 0000000000..dcf9444032
> --- /dev/null
> +++ b/lib/hash/arch/common/compare_signatures.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> + * but the dense one is slightly faster.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	(void) sig_cmp_fn;
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
> +	/* For match mask every bits indicates the match */
> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +		*hitmask_buffer |=
> +			((sig == prim_bucket_sigs[i]) << i);
> +		*hitmask_buffer |=
> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +	}
> +
> +}
Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
One question I have - does it mean that now for x86 we always use 'sparse' while for all other
ARM and non-ARM platforms we switch to 'dense'?
> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
> new file mode 100644
> index 0000000000..7eec499e1f
> --- /dev/null
> +++ b/lib/hash/arch/x86/compare_signatures.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * x86's version uses a sparsely packed hitmask buffer:
> + * Every other bit is padding.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 0
> +
> +static inline void
> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> +			const struct rte_hash_bucket *prim_bkt,
> +			const struct rte_hash_bucket *sec_bkt,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	/* For match mask the first bit of every two bits indicates the match */
> +	switch (sig_cmp_fn) {
> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> +	case RTE_HASH_COMPARE_SSE:
> +		/* Compare all signatures in the bucket */
> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> +				_mm_load_si128(
> +					(__m128i const *)prim_bkt->sig_current),
> +				_mm_set1_epi16(sig)));
> +		/* Extract the even-index bits only */
> +		*prim_hash_matches &= 0x5555;
> +		/* Compare all signatures in the bucket */
> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> +				_mm_load_si128(
> +					(__m128i const *)sec_bkt->sig_current),
> +				_mm_set1_epi16(sig)));
> +		/* Extract the even-index bits only */
> +		*sec_hash_matches &= 0x5555;
> +		break;
> +#endif /* defined(__SSE2__) */
> +	default:
> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			*prim_hash_matches |=
> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
> +			*sec_hash_matches |=
> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
> +		}
> +	}
> +}
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-19 10:41     ` Konstantin Ananyev
@ 2024-03-19 13:09       ` Yoan Picchi
  2024-03-19 13:25         ` Konstantin Ananyev
  0 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-03-19 13:09 UTC (permalink / raw)
  To: Konstantin Ananyev, Yoan Picchi, Thomas Monjalon, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown
On 3/19/24 10:41, Konstantin Ananyev wrote:
> 
> Hi,
> 
>> Current hitmask includes padding due to Intel's SIMD
>> implementation detail. This patch allows non Intel SIMD
>> implementations to benefit from a dense hitmask.
>> In addition, the new dense hitmask interweave the primary
>> and secondary matches which allow a better cache usage and
>> enable future improvements for the SIMD implementations
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> ---
>>   .mailmap                                  |   2 +
>>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>>   lib/hash/arch/common/compare_signatures.h |  38 +++++
>>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>>   5 files changed, 255 insertions(+), 91 deletions(-)
>>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
>>   create mode 100644 lib/hash/arch/common/compare_signatures.h
>>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
>>
>> diff --git a/.mailmap b/.mailmap
>> index 66ebc20666..00b50414d3 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>>   Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>>   Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>>   Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
>> +Harjot Singh <harjot.singh@arm.com>
>>   Harman Kalra <hkalra@marvell.com>
>>   Harneet Singh <harneet.singh@intel.com>
>>   Harold Huang <baymaxhuang@gmail.com>
>> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>>   Yi Zhang <zhang.yi75@zte.com.cn>
>>   Yoann Desmouceaux <ydesmouc@cisco.com>
>> +Yoan Picchi <yoan.picchi@arm.com>
>>   Yogesh Jangra <yogesh.jangra@intel.com>
>>   Yogev Chaimovich <yogev@cgstowernetworks.com>
>>   Yongjie Gu <yongjiex.gu@intel.com>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> new file mode 100644
>> index 0000000000..1af6ba8190
>> --- /dev/null
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -0,0 +1,61 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * Arm's version uses a densely packed hitmask buffer:
>> + * Every bit is in use.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_NEON: {
>> +		uint16x8_t vmat, vsig, x;
>> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> +		uint16_t low, high;
>> +
>> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>> +		/* Compare all signatures in the primary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		low = (uint16_t)(vaddvq_u16(x));
>> +		/* Compare all signatures in the secondary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		high = (uint16_t)(vaddvq_u16(x));
>> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +
>> +		}
>> +		break;
>> +#endif
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*hitmask_buffer |=
>> +				((sig == prim_bucket_sigs[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +		}
>> +	}
>> +}
>> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
>> new file mode 100644
>> index 0000000000..dcf9444032
>> --- /dev/null
>> +++ b/lib/hash/arch/common/compare_signatures.h
>> @@ -0,0 +1,38 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * The generic version could use either a dense or sparsely packed hitmask buffer,
>> + * but the dense one is slightly faster.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	(void) sig_cmp_fn;
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +		*hitmask_buffer |=
>> +			((sig == prim_bucket_sigs[i]) << i);
>> +		*hitmask_buffer |=
>> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +	}
>> +
>> +}
> 
> Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> ARM and non-ARM platforms we switch to 'dense'?
Yes it does. x86 supports only the sparse method (the legacy one). Arm 
and generic code could support both dense and sparse. The reason I made 
them use the dense method is that it was slightly faster in my tests 
(no need to add padding and shifts, among other benefits).
> 
>> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
>> new file mode 100644
>> index 0000000000..7eec499e1f
>> --- /dev/null
>> +++ b/lib/hash/arch/x86/compare_signatures.h
>> @@ -0,0 +1,53 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * x86's version uses a sparsely packed hitmask buffer:
>> + * Every other bit is padding.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 0
>> +
>> +static inline void
>> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> +			const struct rte_hash_bucket *prim_bkt,
>> +			const struct rte_hash_bucket *sec_bkt,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	/* For match mask the first bit of every two bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_SSE:
>> +		/* Compare all signatures in the bucket */
>> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)prim_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*prim_hash_matches &= 0x5555;
>> +		/* Compare all signatures in the bucket */
>> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)sec_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*sec_hash_matches &= 0x5555;
>> +		break;
>> +#endif /* defined(__SSE2__) */
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*prim_hash_matches |=
>> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
>> +			*sec_hash_matches |=
>> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
>> +		}
>> +	}
>> +}
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * RE: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-19 13:09       ` Yoan Picchi
@ 2024-03-19 13:25         ` Konstantin Ananyev
  0 siblings, 0 replies; 73+ messages in thread
From: Konstantin Ananyev @ 2024-03-19 13:25 UTC (permalink / raw)
  To: Yoan Picchi, Yoan Picchi, Thomas Monjalon, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown
> >
> > Hi,
> >
> >> Current hitmask includes padding due to Intel's SIMD
> >> implementation detail. This patch allows non Intel SIMD
> >> implementations to benefit from a dense hitmask.
> >> In addition, the new dense hitmask interweave the primary
> >> and secondary matches which allow a better cache usage and
> >> enable future improvements for the SIMD implementations
> >>
> >> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> >> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> >> ---
> >>   .mailmap                                  |   2 +
> >>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
> >>   lib/hash/arch/common/compare_signatures.h |  38 +++++
> >>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
> >>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
> >>   5 files changed, 255 insertions(+), 91 deletions(-)
> >>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
> >>   create mode 100644 lib/hash/arch/common/compare_signatures.h
> >>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
> >>
> >> diff --git a/.mailmap b/.mailmap
> >> index 66ebc20666..00b50414d3 100644
> >> --- a/.mailmap
> >> +++ b/.mailmap
> >> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
> >>   Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
> >>   Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
> >>   Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> >> +Harjot Singh <harjot.singh@arm.com>
> >>   Harman Kalra <hkalra@marvell.com>
> >>   Harneet Singh <harneet.singh@intel.com>
> >>   Harold Huang <baymaxhuang@gmail.com>
> >> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
> >>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
> >>   Yi Zhang <zhang.yi75@zte.com.cn>
> >>   Yoann Desmouceaux <ydesmouc@cisco.com>
> >> +Yoan Picchi <yoan.picchi@arm.com>
> >>   Yogesh Jangra <yogesh.jangra@intel.com>
> >>   Yogev Chaimovich <yogev@cgstowernetworks.com>
> >>   Yongjie Gu <yongjiex.gu@intel.com>
> >> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..1af6ba8190
> >> --- /dev/null
> >> +++ b/lib/hash/arch/arm/compare_signatures.h
> >> @@ -0,0 +1,61 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * Arm's version uses a densely packed hitmask buffer:
> >> + * Every bit is in use.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 1
> >> +
> >> +static inline void
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >> +	/* For match mask every bits indicates the match */
> >> +	switch (sig_cmp_fn) {
> >> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> >> +	case RTE_HASH_COMPARE_NEON: {
> >> +		uint16x8_t vmat, vsig, x;
> >> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> >> +		uint16_t low, high;
> >> +
> >> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
> >> +		/* Compare all signatures in the primary bucket */
> >> +		vmat = vceqq_u16(vsig,
> >> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> >> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +		low = (uint16_t)(vaddvq_u16(x));
> >> +		/* Compare all signatures in the secondary bucket */
> >> +		vmat = vceqq_u16(vsig,
> >> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> >> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +		high = (uint16_t)(vaddvq_u16(x));
> >> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> >> +
> >> +		}
> >> +		break;
> >> +#endif
> >> +	default:
> >> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +			*hitmask_buffer |=
> >> +				((sig == prim_bucket_sigs[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +		}
> >> +	}
> >> +}
> >> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..dcf9444032
> >> --- /dev/null
> >> +++ b/lib/hash/arch/common/compare_signatures.h
> >> @@ -0,0 +1,38 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> >> + * but the dense one is slightly faster.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 1
> >> +
> >> +static inline void
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +	(void) sig_cmp_fn;
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >> +	/* For match mask every bits indicates the match */
> >> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +		*hitmask_buffer |=
> >> +			((sig == prim_bucket_sigs[i]) << i);
> >> +		*hitmask_buffer |=
> >> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +	}
> >> +
> >> +}
> >
> > Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> > One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> > ARM and non-ARM platforms we switch to 'dense'?
> 
> Yes it does. x86 support only the sparse method (the legacy one). Arm
> and generic code could support both dense and sparse. The reason I made
> them use the dense method is because it was slightly faster in my tests.
Ok, but before that, the 'generic' one (non-x86 and non-ARM) used the 'sparse' one, correct?
If so, then we probably need to outline it a bit more in the patch comments and maybe even in the release notes.
At least that would be my expectation; the hash lib maintainers probably need to say what the best way is here.
The code refactoring itself - LGTM.
> (no need to add padding and shifts amongst other benefit.)
> 
> >
> >> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..7eec499e1f
> >> --- /dev/null
> >> +++ b/lib/hash/arch/x86/compare_signatures.h
> >> @@ -0,0 +1,53 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * x86's version uses a sparsely packed hitmask buffer:
> >> + * Every other bit is padding.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 0
> >> +
> >> +static inline void
> >> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> >> +			const struct rte_hash_bucket *prim_bkt,
> >> +			const struct rte_hash_bucket *sec_bkt,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +	/* For match mask the first bit of every two bits indicates the match */
> >> +	switch (sig_cmp_fn) {
> >> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> >> +	case RTE_HASH_COMPARE_SSE:
> >> +		/* Compare all signatures in the bucket */
> >> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> +				_mm_load_si128(
> >> +					(__m128i const *)prim_bkt->sig_current),
> >> +				_mm_set1_epi16(sig)));
> >> +		/* Extract the even-index bits only */
> >> +		*prim_hash_matches &= 0x5555;
> >> +		/* Compare all signatures in the bucket */
> >> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> +				_mm_load_si128(
> >> +					(__m128i const *)sec_bkt->sig_current),
> >> +				_mm_set1_epi16(sig)));
> >> +		/* Extract the even-index bits only */
> >> +		*sec_hash_matches &= 0x5555;
> >> +		break;
> >> +#endif /* defined(__SSE2__) */
> >> +	default:
> >> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +			*prim_hash_matches |=
> >> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
> >> +			*sec_hash_matches |=
> >> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
> >> +		}
> >> +	}
> >> +}
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
 
- * Re: [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-03-19 10:41     ` Konstantin Ananyev
@ 2024-03-19 16:09     ` Stephen Hemminger
  1 sibling, 0 replies; 73+ messages in thread
From: Stephen Hemminger @ 2024-03-19 16:09 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown
On Tue, 12 Mar 2024 15:42:12 +0000
Yoan Picchi <yoan.picchi@arm.com> wrote:
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
Space around math operations please.
^ permalink raw reply	[flat|nested] 73+ messages in thread 
 
- * [PATCH v7 2/4] hash: optimize compare signature for NEON
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  2024-03-20  7:37     ` [EXTERNAL] " Pavan Nikhilesh Bhagavatula
  2024-03-12 15:42   ` [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1
We can skip shifting by simply masking with specific masks.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 1af6ba8190..b5a457f936 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * RE: [EXTERNAL] [PATCH v7 2/4] hash: optimize compare signature for NEON
  2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-03-20  7:37     ` Pavan Nikhilesh Bhagavatula
  2024-04-11 13:32       ` Yoan Picchi
  0 siblings, 1 reply; 73+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2024-03-20  7:37 UTC (permalink / raw)
  To: Yoan Picchi, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown, Jerin Jacob
> Upon a successful comparison, NEON sets all the bits in the lane to 1
> We can skip shifting by simply masking with specific masks.
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> ---
>  lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
>  1 file changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/hash/arch/arm/compare_signatures.h
> b/lib/hash/arch/arm/compare_signatures.h
> index 1af6ba8190..b5a457f936 100644
> --- a/lib/hash/arch/arm/compare_signatures.h
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t
> *hitmask_buffer,
>  	switch (sig_cmp_fn) {
>  #if RTE_HASH_BUCKET_ENTRIES <= 8
>  	case RTE_HASH_COMPARE_NEON: {
> -		uint16x8_t vmat, vsig, x;
> -		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> -		uint16_t low, high;
> +		uint16x8_t vmat, hit1, hit2;
> +		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20,
> 0x40, 0x80};
> +		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const
> *)&sig);
> 
> -		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>  		/* Compare all signatures in the primary bucket */
> -		vmat = vceqq_u16(vsig,
> -			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)),
> shift);
> -		low = (uint16_t)(vaddvq_u16(x));
> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
> +		hit1 = vandq_u16(vmat, mask);
> +
>  		/* Compare all signatures in the secondary bucket */
> -		vmat = vceqq_u16(vsig,
> -			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)),
> shift);
> -		high = (uint16_t)(vaddvq_u16(x));
> -		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
> +		hit2 = vandq_u16(vmat, mask);
> 
> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
> +		hit2 = vorrq_u16(hit1, hit2);
> +		*hitmask_buffer = vaddvq_u16(hit2);
Since vaddv is expensive could you convert it to vshrn?
https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_neon.h#L226
>  		}
>  		break;
>  #endif
> --
> 2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [EXTERNAL] [PATCH v7 2/4] hash: optimize compare signature for NEON
  2024-03-20  7:37     ` [EXTERNAL] " Pavan Nikhilesh Bhagavatula
@ 2024-04-11 13:32       ` Yoan Picchi
  0 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-11 13:32 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Yoan Picchi, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Ruifeng Wang, Nathan Brown, Jerin Jacob
On 3/20/24 07:37, Pavan Nikhilesh Bhagavatula wrote:
>> Upon a successful comparison, NEON sets all the bits in the lane to 1
>> We can skip shifting by simply masking with specific masks.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> ---
>>   lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
>>   1 file changed, 11 insertions(+), 13 deletions(-)
>>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h
>> b/lib/hash/arch/arm/compare_signatures.h
>> index 1af6ba8190..b5a457f936 100644
>> --- a/lib/hash/arch/arm/compare_signatures.h
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t
>> *hitmask_buffer,
>>   	switch (sig_cmp_fn) {
>>   #if RTE_HASH_BUCKET_ENTRIES <= 8
>>   	case RTE_HASH_COMPARE_NEON: {
>> -		uint16x8_t vmat, vsig, x;
>> -		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> -		uint16_t low, high;
>> +		uint16x8_t vmat, hit1, hit2;
>> +		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20,
>> 0x40, 0x80};
>> +		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const
>> *)&sig);
>>
>> -		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>>   		/* Compare all signatures in the primary bucket */
>> -		vmat = vceqq_u16(vsig,
>> -			vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)),
>> shift);
>> -		low = (uint16_t)(vaddvq_u16(x));
>> +		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
>> +		hit1 = vandq_u16(vmat, mask);
>> +
>>   		/* Compare all signatures in the secondary bucket */
>> -		vmat = vceqq_u16(vsig,
>> -			vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> -		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)),
>> shift);
>> -		high = (uint16_t)(vaddvq_u16(x));
>> -		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
>> +		hit2 = vandq_u16(vmat, mask);
>>
>> +		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
>> +		hit2 = vorrq_u16(hit1, hit2);
>> +		*hitmask_buffer = vaddvq_u16(hit2);
> 
> Since vaddv is expensive could you convert it to vshrn?
> 
> https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
> 
> https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_neon.h#L226
Thank you for those links, it was a good read.
Unfortunately I don't think it is a good fit for this use case. A decent part of
the speedup I get comes from using a dense hitmask, i.e. every bit counts, with
no padding. Using vshrn would have 4 bits of padding, and stripping
them would be more expensive than using a regular reduce.
> 
>>   		}
>>   		break;
>>   #endif
>> --
>> 2.25.1
> 
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
 
- * [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  2024-03-12 15:42   ` [PATCH v7 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(¶ms_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v7 4/4] hash: add SVE support for bulk key lookup
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-03-12 15:42   ` [PATCH v7 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-03-12 15:42   ` Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-03-12 15:42 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than optimized NEON for N2 processor for
128b vectors.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  7 +++-
 lib/hash/rte_cuckoo_hash.h             |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index b5a457f936..8a0627e119 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparison at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i/8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0697743cdf..75f555ba2c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v8 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (6 preceding siblings ...)
  2024-03-12 15:42 ` [PATCH v7 0/4] " Yoan Picchi
@ 2024-04-17 16:08 ` Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
                   ` (3 subsequent siblings)
  11 siblings, 4 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  NEON is up to 7-10% faster (ran on Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
V7->V8:
  Commit message
  Typos and missing spaces
Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 394 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  2024-04-17 18:12     ` Stephen Hemminger
  2024-04-17 16:08   ` [PATCH v8 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
The current hitmask includes padding due to an implementation
detail of Intel's SIMD code. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interleaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
The default non-SIMD path now uses this dense mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 255 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..63eb341d0e
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..59157d31e1
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-04-17 18:12     ` Stephen Hemminger
  0 siblings, 0 replies; 73+ messages in thread
From: Stephen Hemminger @ 2024-04-17 18:12 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown
On Wed, 17 Apr 2024 16:08:04 +0000
Yoan Picchi <yoan.picchi@arm.com> wrote:
> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> new file mode 100644
> index 0000000000..59157d31e1
> --- /dev/null
> +++ b/lib/hash/arch/common/compare_signatures.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> + * but the dense one is slightly faster.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	(void) sig_cmp_fn;
Please use __rte_unused attribute for this.
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
The message should be indented like multi-line function args.
If possible shorten the message.
> +
> +	/* For match mask every bits indicates the match */
> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +		*hitmask_buffer |=
> +			((sig == prim_bucket_sigs[i]) << i);
Don't really need () around the whole expression here.
> +		*hitmask_buffer |=
> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +	}
> +
> +}
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v8 2/4] hash: optimize compare signature for NEON
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking each lane with a lane-specific mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 63eb341d0e..2601ed68b3 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  2024-04-17 16:08   ` [PATCH v8 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(¶ms_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v8 4/4] hash: add SVE support for bulk key lookup
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-04-17 16:08   ` [PATCH v8 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-04-17 16:08   ` Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-17 16:08 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than optimized NEON on the N2 processor with
128b vectors.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  7 +++-
 lib/hash/rte_cuckoo_hash.h             |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 2601ed68b3..140ff97b1d 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0697743cdf..75f555ba2c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v9 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (7 preceding siblings ...)
  2024-04-17 16:08 ` [PATCH v8 0/4] " Yoan Picchi
@ 2024-04-30 16:27 ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (5 more replies)
  2024-07-03 17:13 ` [PATCH v10 " Yoan Picchi
                   ` (2 subsequent siblings)
  11 siblings, 6 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  NEON is up to 7-10% faster (ran on Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
V7->V8:
  Commit message
  Typos and missing spaces
V8->V9:
  Use __rte_unused instead of (void)
  Fix an indentation mistake
Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
 lib/hash/arch/common/compare_signatures.h |  37 ++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |   1 +
 7 files changed, 393 insertions(+), 115 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-06-14 13:42     ` David Marchand
  2024-04-30 16:27   ` [PATCH v9 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
The current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
The default non-SIMD path now uses this dense mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  37 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 254 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..46d15da89f
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+				  "hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				(sig == prim_bucket_sigs[i]) << i;
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..f43b367005
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+				  "hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			(sig == prim_bucket_sigs[i]) << i;
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..db342804e1
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				(sig == prim_bkt->sig_current[i]) << (i << 1);
+			*sec_hash_matches |=
+				(sig == sec_bkt->sig_current[i]) << (i << 1);
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-06-14 13:42     ` David Marchand
  0 siblings, 0 replies; 73+ messages in thread
From: David Marchand @ 2024-06-14 13:42 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown,
	Paul Szczepanek
On Tue, Apr 30, 2024 at 6:28 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
>
> Current hitmask includes padding due to Intel's SIMD
> implementation detail. This patch allows non Intel SIMD
> implementations to benefit from a dense hitmask.
> In addition, the new dense hitmask interweave the primary
> and secondary matches which allow a better cache usage and
> enable future improvements for the SIMD implementations
> The default non SIMD path now use this dense mask.
>
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> ---
>  .mailmap                                  |   2 +
>  lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>  lib/hash/arch/common/compare_signatures.h |  37 +++++
>  lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>  lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>  5 files changed, 254 insertions(+), 91 deletions(-)
>  create mode 100644 lib/hash/arch/arm/compare_signatures.h
>  create mode 100644 lib/hash/arch/common/compare_signatures.h
>  create mode 100644 lib/hash/arch/x86/compare_signatures.h
>
> diff --git a/.mailmap b/.mailmap
> index 66ebc20666..00b50414d3 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>  Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>  Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>  Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> +Harjot Singh <harjot.singh@arm.com>
This should be in patch 3.
>  Harman Kalra <hkalra@marvell.com>
>  Harneet Singh <harneet.singh@intel.com>
>  Harold Huang <baymaxhuang@gmail.com>
> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>  Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>  Yi Zhang <zhang.yi75@zte.com.cn>
>  Yoann Desmouceaux <ydesmouc@cisco.com>
> +Yoan Picchi <yoan.picchi@arm.com>
>  Yogesh Jangra <yogesh.jangra@intel.com>
>  Yogev Chaimovich <yogev@cgstowernetworks.com>
>  Yongjie Gu <yongjiex.gu@intel.com>
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> new file mode 100644
> index 0000000000..46d15da89f
> --- /dev/null
> +++ b/lib/hash/arch/arm/compare_signatures.h
Why create a new directory?
Simple lib/hash/hash_compare_signature_{arm,x86,generic}.h are enough.
> @@ -0,0 +1,61 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * Arm's version uses a densely packed hitmask buffer:
> + * Every bit is in use.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
Please separate headers by groups, like in
https://doc.dpdk.org/guides/contributing/coding_style.html#header-includes
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +                       const uint16_t *prim_bucket_sigs,
> +                       const uint16_t *sec_bucket_sigs,
> +                       uint16_t sig,
> +                       enum rte_hash_sig_compare_function sig_cmp_fn)
Strange indent.
> +{
> +
> +       static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
> +                                 "hitmask_buffer must be wide enough to fit a dense hitmask");
This is similar but less strict than an added check in
rte_cuckoo_hash.c later in this patch.
So I suspect only one of those checks is necessary.
But I don't understand the logic, so for you to figure out :-).
> +
> +       /* For match mask every bits indicates the match */
> +       switch (sig_cmp_fn) {
> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> +       case RTE_HASH_COMPARE_NEON: {
> +               uint16x8_t vmat, vsig, x;
> +               int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> +               uint16_t low, high;
> +
> +               vsig = vld1q_dup_u16((uint16_t const *)&sig);
> +               /* Compare all signatures in the primary bucket */
> +               vmat = vceqq_u16(vsig,
> +                       vld1q_u16((uint16_t const *)prim_bucket_sigs));
General comment for this series.
When possible, keep on the same line up to 100 chars, the code is hard
enough to read with all those vector intrinsics...
> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +               low = (uint16_t)(vaddvq_u16(x));
> +               /* Compare all signatures in the secondary bucket */
> +               vmat = vceqq_u16(vsig,
> +                       vld1q_u16((uint16_t const *)sec_bucket_sigs));
> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +               high = (uint16_t)(vaddvq_u16(x));
> +               *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +
> +               }
> +               break;
> +#endif
> +       default:
> +               for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +                       *hitmask_buffer |=
> +                               (sig == prim_bucket_sigs[i]) << i;
> +                       *hitmask_buffer |=
> +                               ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +               }
> +       }
> +}
[snip]
-- 
David Marchand
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v9 2/4] hash: optimize compare signature for NEON
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking with specific masks.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 46d15da89f..72bd171484 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -30,23 +30,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-04-30 16:27   ` [PATCH v9 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index d586878a22..4f871b3499 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v9 4/4] hash: add SVE support for bulk key lookup
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-04-30 16:27   ` [PATCH v9 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-04-30 16:27   ` Yoan Picchi
  2024-06-14 13:42     ` David Marchand
  2024-06-14 13:43   ` [PATCH v9 0/4] " David Marchand
  2024-06-27 14:48   ` Thomas Monjalon
  5 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-04-30 16:27 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than optimized NEON for N2 processor for
128b vectors.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c             |  7 +++-
 lib/hash/rte_cuckoo_hash.h             |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
index 72bd171484..b4b4cf04e9 100644
--- a/lib/hash/arch/arm/compare_signatures.h
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred,
+				primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+							&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 0697743cdf..75f555ba2c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..01ad01c258 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v9 4/4] hash: add SVE support for bulk key lookup
  2024-04-30 16:27   ` [PATCH v9 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-06-14 13:42     ` David Marchand
  0 siblings, 0 replies; 73+ messages in thread
From: David Marchand @ 2024-06-14 13:42 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin,
	dev, nd, Harjot Singh, Nathan Brown, Ruifeng Wang
On Tue, Apr 30, 2024 at 6:28 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
>
> - Implemented SVE code for comparing signatures in bulk lookup.
> - New SVE code is ~5% slower than optimized NEON for N2 processor for
> 128b vectors.
>
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Signed-off-by: Harjot Singh <harjot.singh@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++
>  lib/hash/rte_cuckoo_hash.c             |  7 +++-
>  lib/hash/rte_cuckoo_hash.h             |  1 +
>  3 files changed, 65 insertions(+), 1 deletion(-)
>
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> index 72bd171484..b4b4cf04e9 100644
> --- a/lib/hash/arch/arm/compare_signatures.h
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
>                 *hitmask_buffer = vaddvq_u16(hit2);
>                 }
>                 break;
> +#endif
> +#if defined(RTE_HAS_SVE_ACLE)
> +       case RTE_HASH_COMPARE_SVE: {
> +               svuint16_t vsign, shift, sv_matches;
> +               svbool_t pred, match, bucket_wide_pred;
> +               int i = 0;
> +               uint64_t vl = svcnth();
> +
> +               vsign = svdup_u16(sig);
> +               shift = svindex_u16(0, 1);
> +
> +               if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
> +                       svuint16_t primary_array_vect, secondary_array_vect;
> +                       bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
> +                       primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
> +                       secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
> +
> +                       /* We merged the two vectors so we can do both comparisons at once */
> +                       primary_array_vect = svsplice_u16(bucket_wide_pred,
> +                               primary_array_vect,
> +                               secondary_array_vect);
> +                       pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
> +
> +                       /* Compare all signatures in the buckets */
> +                       match = svcmpeq_u16(pred, vsign, primary_array_vect);
> +                       if (svptest_any(svptrue_b16(), match)) {
> +                               sv_matches = svdup_u16(1);
> +                               sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +                               *hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
> +                       }
> +               } else {
> +                       do {
> +                               pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
> +                               uint16_t lower_half = 0;
> +                               uint16_t upper_half = 0;
> +                               /* Compare all signatures in the primary bucket */
> +                               match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> +                                                       &prim_bucket_sigs[i]));
> +                               if (svptest_any(svptrue_b16(), match)) {
> +                                       sv_matches = svdup_u16(1);
> +                                       sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +                                       lower_half = svorv_u16(svptrue_b16(), sv_matches);
> +                               }
> +                               /* Compare all signatures in the secondary bucket */
> +                               match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
> +                                                       &sec_bucket_sigs[i]));
> +                               if (svptest_any(svptrue_b16(), match)) {
> +                                       sv_matches = svdup_u16(1);
> +                                       sv_matches = svlsl_u16_z(match, sv_matches, shift);
> +                                       upper_half = svorv_u16(svptrue_b16(), sv_matches)
> +                                               << RTE_HASH_BUCKET_ENTRIES;
> +                               }
> +                               hitmask_buffer[i / 8] = upper_half | lower_half;
> +                               i += vl;
> +                       } while (i < RTE_HASH_BUCKET_ENTRIES);
> +               }
> +               }
> +               break;
>  #endif
>         default:
>                 for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
> index 0697743cdf..75f555ba2c 100644
> --- a/lib/hash/rte_cuckoo_hash.c
> +++ b/lib/hash/rte_cuckoo_hash.c
> @@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
>                 h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
>         else
>  #elif defined(RTE_ARCH_ARM64)
> -       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
> +       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
>                 h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
> +#if defined(RTE_HAS_SVE_ACLE)
> +               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
> +                       h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
> +#endif
> +       }
>         else
>  #endif
>                 h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
> index a528f1d1a0..01ad01c258 100644
> --- a/lib/hash/rte_cuckoo_hash.h
> +++ b/lib/hash/rte_cuckoo_hash.h
> @@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function {
>         RTE_HASH_COMPARE_SCALAR = 0,
>         RTE_HASH_COMPARE_SSE,
>         RTE_HASH_COMPARE_NEON,
> +       RTE_HASH_COMPARE_SVE,
>         RTE_HASH_COMPARE_NUM
>  };
I am surprised the ABI check does not complain over this change.
RTE_HASH_COMPARE_NUM is not used and knowing the number of compare
function implementations should not be of interest for an application.
But it still seems to be an ABI breakage to me.
RTE_HASH_COMPARE_NUM can be removed in v24.11.
And ideally, sig_cmp_fn should be made opaque (or moved to an opaque
struct out of the rte_hash public struct).
-- 
David Marchand
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * Re: [PATCH v9 0/4] hash: add SVE support for bulk key lookup
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
                     ` (3 preceding siblings ...)
  2024-04-30 16:27   ` [PATCH v9 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-06-14 13:43   ` David Marchand
  2024-06-18 15:55     ` Konstantin Ananyev
  2024-06-27 14:48   ` Thomas Monjalon
  5 siblings, 1 reply; 73+ messages in thread
From: David Marchand @ 2024-06-14 13:43 UTC (permalink / raw)
  To: Bruce Richardson, Vladimir Medvedkin, Konstantin Ananyev
  Cc: dev, nd, Yoan Picchi
On Tue, Apr 30, 2024 at 6:28 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
>
> This patchset adds SVE support for the signature comparison in the cuckoo
> hash lookup and improves the existing NEON implementation. These
> optimizations required changes to the data format and signature of the
> relevant functions to support dense hitmasks (no padding) and having the
> primary and secondary hitmasks interleaved instead of being in their own
> array each.
>
> Benchmarking the cuckoo hash perf test, I observed this effect on speed:
>   There are no significant changes on Intel (ran on Sapphire Rapids)
>   Neon is up to 7-10% faster (ran on ampere altra)
>   128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
>     3 cloud instance)
>   256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
>     3 cloud instance)
>
> V2->V3:
>   Remove a redundant if in the test
>   Change a couple int to uint16_t in compare_signatures_dense
>   Several codding-style fix
>
> V3->V4:
>   Rebase
>
> V4->V5:
>   Commit message
>
> V5->V6:
>   Move the arch-specific code into new arch-specific files
>   Isolate the data struture refactor from adding SVE
>
> V6->V7:
>   Commit message
>   Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
>
> V7->V8:
>   Commit message
>   Typos and missing spaces
>
> V8->V9:
>   Use __rte_unused instead of (void)
>   Fix an indentation mistake
>
> Yoan Picchi (4):
>   hash: pack the hitmask for hash in bulk lookup
>   hash: optimize compare signature for NEON
>   test/hash: check bulk lookup of keys after collision
>   hash: add SVE support for bulk key lookup
>
>  .mailmap                                  |   2 +
>  app/test/test_hash.c                      |  99 ++++++++---
>  lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
>  lib/hash/arch/common/compare_signatures.h |  37 ++++
>  lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>  lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
>  lib/hash/rte_cuckoo_hash.h                |   1 +
>  7 files changed, 393 insertions(+), 115 deletions(-)
>  create mode 100644 lib/hash/arch/arm/compare_signatures.h
>  create mode 100644 lib/hash/arch/common/compare_signatures.h
>  create mode 100644 lib/hash/arch/x86/compare_signatures.h
>
Can any of you have a look at this series?
Thanks.
-- 
David Marchand
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * RE: [PATCH v9 0/4] hash: add SVE support for bulk key lookup
  2024-06-14 13:43   ` [PATCH v9 0/4] " David Marchand
@ 2024-06-18 15:55     ` Konstantin Ananyev
  0 siblings, 0 replies; 73+ messages in thread
From: Konstantin Ananyev @ 2024-06-18 15:55 UTC (permalink / raw)
  To: David Marchand, Bruce Richardson, Vladimir Medvedkin; +Cc: dev, nd, Yoan Picchi
Hi David,
> > This patchset adds SVE support for the signature comparison in the cuckoo
> > hash lookup and improves the existing NEON implementation. These
> > optimizations required changes to the data format and signature of the
> > relevant functions to support dense hitmasks (no padding) and having the
> > primary and secondary hitmasks interleaved instead of being in their own
> > array each.
> >
> > Benchmarking the cuckoo hash perf test, I observed this effect on speed:
> >   There are no significant changes on Intel (ran on Sapphire Rapids)
> >   Neon is up to 7-10% faster (ran on ampere altra)
> >   128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
> >     3 cloud instance)
> >   256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
> >     3 cloud instance)
> >
> > V2->V3:
> >   Remove a redundant if in the test
> >   Change a couple int to uint16_t in compare_signatures_dense
> >   Several codding-style fix
> >
> > V3->V4:
> >   Rebase
> >
> > V4->V5:
> >   Commit message
> >
> > V5->V6:
> >   Move the arch-specific code into new arch-specific files
> >   Isolate the data struture refactor from adding SVE
> >
> > V6->V7:
> >   Commit message
> >   Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
> >
> > V7->V8:
> >   Commit message
> >   Typos and missing spaces
> >
> > V8->V9:
> >   Use __rte_unused instead of (void)
> >   Fix an indentation mistake
> >
> > Yoan Picchi (4):
> >   hash: pack the hitmask for hash in bulk lookup
> >   hash: optimize compare signature for NEON
> >   test/hash: check bulk lookup of keys after collision
> >   hash: add SVE support for bulk key lookup
> >
> >  .mailmap                                  |   2 +
> >  app/test/test_hash.c                      |  99 ++++++++---
> >  lib/hash/arch/arm/compare_signatures.h    | 117 +++++++++++++
> >  lib/hash/arch/common/compare_signatures.h |  37 ++++
> >  lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
> >  lib/hash/rte_cuckoo_hash.c                | 199 ++++++++++++----------
> >  lib/hash/rte_cuckoo_hash.h                |   1 +
> >  7 files changed, 393 insertions(+), 115 deletions(-)
> >  create mode 100644 lib/hash/arch/arm/compare_signatures.h
> >  create mode 100644 lib/hash/arch/common/compare_signatures.h
> >  create mode 100644 lib/hash/arch/x86/compare_signatures.h
> >
> 
> Can any of you have a look at this series?
> Thanks.
It looks ok to me.
The only un-processed comment I have about it, from v7:
Ok, but before that, a 'generic' one (non-x86 and non-ARM) used 'sparse' one, correct?
If so, then probably need to outline it a bit more in patch comments and might be even release notes.
At least that would be my expectations, probably hash lib maintainers need to say what is the best way here.
The code refactoring itself - LGTM.
 
https://inbox.dpdk.org/dev/3cfce8e3b128473096e1d43683fbe6f0@huawei.com/
^ permalink raw reply	[flat|nested] 73+ messages in thread 
 
- * Re: [PATCH v9 0/4] hash: add SVE support for bulk key lookup
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
                     ` (4 preceding siblings ...)
  2024-06-14 13:43   ` [PATCH v9 0/4] " David Marchand
@ 2024-06-27 14:48   ` Thomas Monjalon
  5 siblings, 0 replies; 73+ messages in thread
From: Thomas Monjalon @ 2024-06-27 14:48 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: dev, nd, david.marchand, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin
30/04/2024 18:27, Yoan Picchi:
> This patchset adds SVE support for the signature comparison in the cuckoo
> hash lookup and improves the existing NEON implementation. These
> optimizations required changes to the data format and signature of the
> relevant functions to support dense hitmasks (no padding) and having the
> primary and secondary hitmasks interleaved instead of being in their own
> array each.
> 
> Benchmarking the cuckoo hash perf test, I observed this effect on speed:
>   There are no significant changes on Intel (ran on Sapphire Rapids)
>   Neon is up to 7-10% faster (ran on ampere altra)
>   128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
>     3 cloud instance)
>   256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
>     3 cloud instance)
> 
> V2->V3:
>   Remove a redundant if in the test
>   Change a couple int to uint16_t in compare_signatures_dense
>   Several codding-style fix
> 
> V3->V4:
>   Rebase
> 
> V4->V5:
>   Commit message
> 
> V5->V6:
>   Move the arch-specific code into new arch-specific files
>   Isolate the data struture refactor from adding SVE
> 
> V6->V7:
>   Commit message
>   Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
> 
> V7->V8:
>   Commit message
>   Typos and missing spaces
> 
> V8->V9:
>   Use __rte_unused instead of (void)
>   Fix an indentation mistake
Waiting for a new version after comments sent in June please.
Note: we didn't have a review from the lib maintainers.
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v10 0/4] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (8 preceding siblings ...)
  2024-04-30 16:27 ` [PATCH v9 0/4] " Yoan Picchi
@ 2024-07-03 17:13 ` Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 more replies)
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
  11 siblings, 4 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-03 17:13 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
V7->V8:
  Commit message
  Typos and missing spaces
V8->V9:
  Use __rte_unused instead of (void)
  Fix an indentation mistake
V9->V10:
  Fix more formatting and indentation
  Move the new compare signature file directly in hash instead of being
    in a new subdir
  Re-order includes
  Remove duplicated static check
  Move rte_hash_sig_compare_function's definition into a private header
Yoan Picchi (4):
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/compare_signatures_arm_pvt.h     | 117 +++++++++++++
 lib/hash/compare_signatures_generic_pvt.h |  37 ++++
 lib/hash/compare_signatures_x86_pvt.h     |  49 ++++++
 lib/hash/hash_sig_cmp_func_pvt.h          |  20 +++
 lib/hash/rte_cuckoo_hash.c                | 197 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |  10 +-
 8 files changed, 407 insertions(+), 124 deletions(-)
 create mode 100644 lib/hash/compare_signatures_arm_pvt.h
 create mode 100644 lib/hash/compare_signatures_generic_pvt.h
 create mode 100644 lib/hash/compare_signatures_x86_pvt.h
 create mode 100644 lib/hash/hash_sig_cmp_func_pvt.h
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-07-03 17:13 ` [PATCH v10 " Yoan Picchi
@ 2024-07-03 17:13   ` Yoan Picchi
  2024-07-04 20:31     ` David Marchand
  2024-07-03 17:13   ` [PATCH v10 2/4] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-07-03 17:13 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
The default non-SIMD path now uses this dense mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   1 +
 lib/hash/compare_signatures_arm_pvt.h     |  60 +++++++
 lib/hash/compare_signatures_generic_pvt.h |  37 +++++
 lib/hash/compare_signatures_x86_pvt.h     |  49 ++++++
 lib/hash/hash_sig_cmp_func_pvt.h          |  20 +++
 lib/hash/rte_cuckoo_hash.c                | 190 +++++++++++-----------
 lib/hash/rte_cuckoo_hash.h                |  10 +-
 7 files changed, 267 insertions(+), 100 deletions(-)
 create mode 100644 lib/hash/compare_signatures_arm_pvt.h
 create mode 100644 lib/hash/compare_signatures_generic_pvt.h
 create mode 100644 lib/hash/compare_signatures_x86_pvt.h
 create mode 100644 lib/hash/hash_sig_cmp_func_pvt.h
diff --git a/.mailmap b/.mailmap
index f76037213d..ec525981fe 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1661,6 +1661,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
new file mode 100644
index 0000000000..e83bae9912
--- /dev/null
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+#include "hash_sig_cmp_func_pvt.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+		"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/compare_signatures_generic_pvt.h b/lib/hash/compare_signatures_generic_pvt.h
new file mode 100644
index 0000000000..18c2f651c4
--- /dev/null
+++ b/lib/hash/compare_signatures_generic_pvt.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+#include "hash_sig_cmp_func_pvt.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+			"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+		*hitmask_buffer |= ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/compare_signatures_x86_pvt.h b/lib/hash/compare_signatures_x86_pvt.h
new file mode 100644
index 0000000000..932912ba19
--- /dev/null
+++ b/lib/hash/compare_signatures_x86_pvt.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+#include "hash_sig_cmp_func_pvt.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
+			(__m128i const *)prim_bkt->sig_current), _mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
+			(__m128i const *)sec_bkt->sig_current), _mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |= (sig == prim_bkt->sig_current[i]) << (i << 1);
+			*sec_hash_matches |= (sig == sec_bkt->sig_current[i]) << (i << 1);
+		}
+	}
+}
diff --git a/lib/hash/hash_sig_cmp_func_pvt.h b/lib/hash/hash_sig_cmp_func_pvt.h
new file mode 100644
index 0000000000..d8d2fbffaf
--- /dev/null
+++ b/lib/hash/hash_sig_cmp_func_pvt.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Arm Limited
+ */
+
+#ifndef _SIG_CMP_FUNC_H_
+#define _SIG_CMP_FUNC_H_
+
+/** Enum used to select the implementation of the signature comparison function to use
+ * eg: A system supporting SVE might want to use a NEON implementation.
+ * Those may change and are for internal use only
+ */
+enum rte_hash_sig_compare_function {
+	RTE_HASH_COMPARE_SCALAR = 0,
+	RTE_HASH_COMPARE_SSE,
+	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
+	RTE_HASH_COMPARE_NUM
+};
+
+#endif
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index d87aa52b5b..61cc12d83b 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -32,6 +32,15 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 	RTE_LOG_LINE(level, HASH, "" __VA_ARGS__)
 
 #include "rte_cuckoo_hash.h"
+#include "hash_sig_cmp_func_pvt.h"
+
+#if defined(__ARM_NEON)
+#include "compare_signatures_arm_pvt.h"
+#elif defined(__SSE2__)
+#include "compare_signatures_x86_pvt.h"
+#else
+#include "compare_signatures_generic_pvt.h"
+#endif
 
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
@@ -1880,63 +1889,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1947,22 +1899,41 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1973,10 +1944,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1990,10 +1961,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2015,13 +1994,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2044,7 +2023,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2094,11 +2073,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2112,14 +2100,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2130,10 +2130,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2146,10 +2146,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2175,13 +2183,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2208,7 +2216,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..26a992419a 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -134,14 +134,6 @@ struct rte_hash_key {
 	char key[0];
 };
 
-/* All different signature compare functions */
-enum rte_hash_sig_compare_function {
-	RTE_HASH_COMPARE_SCALAR = 0,
-	RTE_HASH_COMPARE_SSE,
-	RTE_HASH_COMPARE_NEON,
-	RTE_HASH_COMPARE_NUM
-};
-
 /** Bucket structure */
 struct __rte_cache_aligned rte_hash_bucket {
 	uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES];
@@ -199,7 +191,7 @@ struct __rte_cache_aligned rte_hash {
 	/**< Custom function used to compare keys. */
 	enum cmp_jump_table_case cmp_jump_table_idx;
 	/**< Indicates which compare function to use. */
-	enum rte_hash_sig_compare_function sig_cmp_fn;
+	unsigned int sig_cmp_fn;
 	/**< Indicates which signature compare function to use. */
 	uint32_t bucket_bitmask;
 	/**< Bitmask for getting bucket index from hash signature. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-07-03 17:13   ` [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-07-04 20:31     ` David Marchand
  2024-07-05 17:43       ` Yoan Picchi
  0 siblings, 1 reply; 73+ messages in thread
From: David Marchand @ 2024-07-04 20:31 UTC (permalink / raw)
  To: Yoan Picchi
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown
Hello Yoan,
On Wed, Jul 3, 2024 at 7:13 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
>
> Current hitmask includes padding due to Intel's SIMD
> implementation detail. This patch allows non Intel SIMD
> implementations to benefit from a dense hitmask.
> In addition, the new dense hitmask interweave the primary
> and secondary matches which allow a better cache usage and
> enable future improvements for the SIMD implementations
> The default non SIMD path now use this dense mask.
>
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
This patch does too many things at the same time.
There is code movement and behavior modifications all mixed in.
As there was still no review from the lib maintainer... I am going a
bit more in depth this time.
Please split this patch to make it less hard to understand.
I can see the need for at least one patch for isolating the change on
sig_cmp_fn from the exposed API, then one patch for moving the code to
per arch headers with *no behavior change*, and one patch for
introducing/switching to "dense hitmask".
More comments below.
> ---
>  .mailmap                                  |   1 +
>  lib/hash/compare_signatures_arm_pvt.h     |  60 +++++++
>  lib/hash/compare_signatures_generic_pvt.h |  37 +++++
>  lib/hash/compare_signatures_x86_pvt.h     |  49 ++++++
>  lib/hash/hash_sig_cmp_func_pvt.h          |  20 +++
>  lib/hash/rte_cuckoo_hash.c                | 190 +++++++++++-----------
>  lib/hash/rte_cuckoo_hash.h                |  10 +-
>  7 files changed, 267 insertions(+), 100 deletions(-)
>  create mode 100644 lib/hash/compare_signatures_arm_pvt.h
>  create mode 100644 lib/hash/compare_signatures_generic_pvt.h
>  create mode 100644 lib/hash/compare_signatures_x86_pvt.h
>  create mode 100644 lib/hash/hash_sig_cmp_func_pvt.h
>
> diff --git a/.mailmap b/.mailmap
> index f76037213d..ec525981fe 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -1661,6 +1661,7 @@ Yixue Wang <yixue.wang@intel.com>
>  Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>  Yi Zhang <zhang.yi75@zte.com.cn>
>  Yoann Desmouceaux <ydesmouc@cisco.com>
> +Yoan Picchi <yoan.picchi@arm.com>
>  Yogesh Jangra <yogesh.jangra@intel.com>
>  Yogev Chaimovich <yogev@cgstowernetworks.com>
>  Yongjie Gu <yongjiex.gu@intel.com>
> diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
> new file mode 100644
> index 0000000000..e83bae9912
> --- /dev/null
> +++ b/lib/hash/compare_signatures_arm_pvt.h
I guess pvt stands for private.
No need for such suffix, this header won't be exported in any case.
> @@ -0,0 +1,60 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * Arm's version uses a densely packed hitmask buffer:
> + * Every bit is in use.
> + */
Please put a header guard.
#ifndef <UPPERCASE_HEADER_NAME>_H
#define <UPPERCASE_HEADER_NAME>_H
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +
> +#include "rte_cuckoo_hash.h"
> +#include "hash_sig_cmp_func_pvt.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +                       const uint16_t *prim_bucket_sigs,
> +                       const uint16_t *sec_bucket_sigs,
> +                       uint16_t sig,
> +                       enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +
> +       static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
> +               "hitmask_buffer must be wide enough to fit a dense hitmask");
> +
> +       /* For match mask every bits indicates the match */
> +       switch (sig_cmp_fn) {
> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> +       case RTE_HASH_COMPARE_NEON: {
> +               uint16x8_t vmat, vsig, x;
> +               int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> +               uint16_t low, high;
> +
> +               vsig = vld1q_dup_u16((uint16_t const *)&sig);
> +               /* Compare all signatures in the primary bucket */
> +               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +               low = (uint16_t)(vaddvq_u16(x));
> +               /* Compare all signatures in the secondary bucket */
> +               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +               high = (uint16_t)(vaddvq_u16(x));
> +               *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +
> +               }
> +               break;
> +#endif
> +       default:
> +               for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +                       *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
> +                       *hitmask_buffer |=
> +                               ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +               }
> +       }
> +}
IIRC, this code is copied in all three headers.
It is a common scalar version, so the ARM code could simply call the
"generic" implementation rather than copy/paste.
[snip]
> diff --git a/lib/hash/compare_signatures_x86_pvt.h b/lib/hash/compare_signatures_x86_pvt.h
> new file mode 100644
> index 0000000000..932912ba19
> --- /dev/null
> +++ b/lib/hash/compare_signatures_x86_pvt.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * x86's version uses a sparsely packed hitmask buffer:
> + * Every other bit is padding.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +
> +#include "rte_cuckoo_hash.h"
> +#include "hash_sig_cmp_func_pvt.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 0
> +
> +static inline void
> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> +                       const struct rte_hash_bucket *prim_bkt,
> +                       const struct rte_hash_bucket *sec_bkt,
> +                       uint16_t sig,
> +                       enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +       /* For match mask the first bit of every two bits indicates the match */
> +       switch (sig_cmp_fn) {
> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
The check on RTE_HASH_BUCKET_ENTRIES <= 8 seems new.
It was not present in the previous implementation for SSE2, and this
difference is not explained.
> +       case RTE_HASH_COMPARE_SSE:
> +               /* Compare all signatures in the bucket */
> +               *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
> +                       (__m128i const *)prim_bkt->sig_current), _mm_set1_epi16(sig)));
> +               /* Extract the even-index bits only */
> +               *prim_hash_matches &= 0x5555;
> +               /* Compare all signatures in the bucket */
> +               *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
> +                       (__m128i const *)sec_bkt->sig_current), _mm_set1_epi16(sig)));
> +               /* Extract the even-index bits only */
> +               *sec_hash_matches &= 0x5555;
> +               break;
> +#endif /* defined(__SSE2__) */
> +       default:
> +               for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +                       *prim_hash_matches |= (sig == prim_bkt->sig_current[i]) << (i << 1);
> +                       *sec_hash_matches |= (sig == sec_bkt->sig_current[i]) << (i << 1);
> +               }
> +       }
> +}
> diff --git a/lib/hash/hash_sig_cmp_func_pvt.h b/lib/hash/hash_sig_cmp_func_pvt.h
> new file mode 100644
> index 0000000000..d8d2fbffaf
> --- /dev/null
> +++ b/lib/hash/hash_sig_cmp_func_pvt.h
Please rename as compare_signatures.h or maybe a simpler option is to
move this enum declaration in rte_cuckoo_hash.c before including the
per arch headers.
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2024 Arm Limited
> + */
> +
> +#ifndef _SIG_CMP_FUNC_H_
> +#define _SIG_CMP_FUNC_H_
If keeping a header, this guard must reflect the file name.
> +
> +/** Enum used to select the implementation of the signature comparison function to use
/* is enough, doxygen only parses public headers.
> + * eg: A system supporting SVE might want to use a NEON implementation.
> + * Those may change and are for internal use only
> + */
> +enum rte_hash_sig_compare_function {
> +       RTE_HASH_COMPARE_SCALAR = 0,
> +       RTE_HASH_COMPARE_SSE,
> +       RTE_HASH_COMPARE_NEON,
> +       RTE_HASH_COMPARE_SVE,
> +       RTE_HASH_COMPARE_NUM
> +};
> +
> +#endif
[snip]
> diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
> index a528f1d1a0..26a992419a 100644
> --- a/lib/hash/rte_cuckoo_hash.h
> +++ b/lib/hash/rte_cuckoo_hash.h
> @@ -134,14 +134,6 @@ struct rte_hash_key {
>         char key[0];
>  };
>
> -/* All different signature compare functions */
> -enum rte_hash_sig_compare_function {
> -       RTE_HASH_COMPARE_SCALAR = 0,
> -       RTE_HASH_COMPARE_SSE,
> -       RTE_HASH_COMPARE_NEON,
> -       RTE_HASH_COMPARE_NUM
> -};
> -
>  /** Bucket structure */
>  struct __rte_cache_aligned rte_hash_bucket {
>         uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES];
> @@ -199,7 +191,7 @@ struct __rte_cache_aligned rte_hash {
>         /**< Custom function used to compare keys. */
>         enum cmp_jump_table_case cmp_jump_table_idx;
>         /**< Indicates which compare function to use. */
> -       enum rte_hash_sig_compare_function sig_cmp_fn;
> +       unsigned int sig_cmp_fn;
From an ABI perspective, it looks ok.
We may be breaking users that would inspect this public object, but I
think it is ok.
In any case, put this change in a separate patch so it is more visible.
>         /**< Indicates which signature compare function to use. */
>         uint32_t bucket_bitmask;
>         /**< Bitmask for getting bucket index from hash signature. */
> --
> 2.25.1
>
-- 
David Marchand
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-07-04 20:31     ` David Marchand
@ 2024-07-05 17:43       ` Yoan Picchi
  2024-07-07 12:08         ` Thomas Monjalon
  0 siblings, 1 reply; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:43 UTC (permalink / raw)
  To: David Marchand
  Cc: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin, dev, nd, Ruifeng Wang, Nathan Brown
I'll push a v11 tonight. There are a couple of comments I disagree with,
though:
On 7/4/24 21:31, David Marchand wrote:
> Hello Yoan,
> 
> On Wed, Jul 3, 2024 at 7:13 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
>>
>> Current hitmask includes padding due to Intel's SIMD
>> implementation detail. This patch allows non Intel SIMD
>> implementations to benefit from a dense hitmask.
>> In addition, the new dense hitmask interweave the primary
>> and secondary matches which allow a better cache usage and
>> enable future improvements for the SIMD implementations
>> The default non SIMD path now use this dense mask.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> 
> This patch does too many things at the same time.
> There is code movement and behavior modifications all mixed in.
> 
> As there was still no review from the lib maintainer... I am going a
> bit more in depth this time.
> Please split this patch to make it less hard to understand.
> 
> I can see the need for at least one patch for isolating the change on
> sig_cmp_fn from the exposed API, then one patch for moving the code to
> per arch headers with *no behavior change*, and one patch for
> introducing/switching to "dense hitmask".
> 
> More comments below.
> 
> 
>> ---
>>   .mailmap                                  |   1 +
>>   lib/hash/compare_signatures_arm_pvt.h     |  60 +++++++
>>   lib/hash/compare_signatures_generic_pvt.h |  37 +++++
>>   lib/hash/compare_signatures_x86_pvt.h     |  49 ++++++
>>   lib/hash/hash_sig_cmp_func_pvt.h          |  20 +++
>>   lib/hash/rte_cuckoo_hash.c                | 190 +++++++++++-----------
>>   lib/hash/rte_cuckoo_hash.h                |  10 +-
>>   7 files changed, 267 insertions(+), 100 deletions(-)
>>   create mode 100644 lib/hash/compare_signatures_arm_pvt.h
>>   create mode 100644 lib/hash/compare_signatures_generic_pvt.h
>>   create mode 100644 lib/hash/compare_signatures_x86_pvt.h
>>   create mode 100644 lib/hash/hash_sig_cmp_func_pvt.h
>>
>> diff --git a/.mailmap b/.mailmap
>> index f76037213d..ec525981fe 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -1661,6 +1661,7 @@ Yixue Wang <yixue.wang@intel.com>
>>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>>   Yi Zhang <zhang.yi75@zte.com.cn>
>>   Yoann Desmouceaux <ydesmouc@cisco.com>
>> +Yoan Picchi <yoan.picchi@arm.com>
>>   Yogesh Jangra <yogesh.jangra@intel.com>
>>   Yogev Chaimovich <yogev@cgstowernetworks.com>
>>   Yongjie Gu <yongjiex.gu@intel.com>
>> diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
>> new file mode 100644
>> index 0000000000..e83bae9912
>> --- /dev/null
>> +++ b/lib/hash/compare_signatures_arm_pvt.h
> 
> I guess pvt stands for private.
> No need for such suffix, this header won't be exported in any case.
pvt does stand for private, yes. I had a look at the other libs and what
they use to mark a header as private. Several (rcu, ring and stack)
use _pvt, so it looks like that might be the standard? If not, then how
am I supposed to differentiate a public and a private header?
> 
> 
>> @@ -0,0 +1,60 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * Arm's version uses a densely packed hitmask buffer:
>> + * Every bit is in use.
>> + */
> 
> Please put a header guard.
> 
> #ifndef <UPPERCASE_HEADER_NAME>_H
> #define <UPPERCASE_HEADER_NAME>_H
> 
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +
>> +#include "rte_cuckoo_hash.h"
>> +#include "hash_sig_cmp_func_pvt.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +                       const uint16_t *prim_bucket_sigs,
>> +                       const uint16_t *sec_bucket_sigs,
>> +                       uint16_t sig,
>> +                       enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +
>> +       static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
>> +               "hitmask_buffer must be wide enough to fit a dense hitmask");
>> +
>> +       /* For match mask every bits indicates the match */
>> +       switch (sig_cmp_fn) {
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>> +       case RTE_HASH_COMPARE_NEON: {
>> +               uint16x8_t vmat, vsig, x;
>> +               int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> +               uint16_t low, high;
>> +
>> +               vsig = vld1q_dup_u16((uint16_t const *)&sig);
>> +               /* Compare all signatures in the primary bucket */
>> +               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +               low = (uint16_t)(vaddvq_u16(x));
>> +               /* Compare all signatures in the secondary bucket */
>> +               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +               high = (uint16_t)(vaddvq_u16(x));
>> +               *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +
>> +               }
>> +               break;
>> +#endif
>> +       default:
>> +               for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +                       *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
>> +                       *hitmask_buffer |=
>> +                               ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +               }
>> +       }
>> +}
> 
> IIRC, this code is copied in all three headers.
> It is a common scalar version, so the ARM code could simply call the
> "generic" implementation rather than copy/paste.
Out of the three files, only two versions are the same: generic and arm.
Intel's version does have some padding added (given it's sparse).
I prefer to keep a scalar version in the arm implementation because
that's what matches the legacy implementation. We used to be able to
choose (at runtime) to use the scalar path even if we had neon. In
practice the choice ends up being made from #defines, but as far as this
function goes, it is a runtime decision.
[snip]
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup
  2024-07-05 17:43       ` Yoan Picchi
@ 2024-07-07 12:08         ` Thomas Monjalon
  0 siblings, 0 replies; 73+ messages in thread
From: Thomas Monjalon @ 2024-07-07 12:08 UTC (permalink / raw)
  To: David Marchand, Yoan Picchi
  Cc: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin,
	dev, nd, Ruifeng Wang, Nathan Brown
05/07/2024 19:43, Yoan Picchi:
> On 7/4/24 21:31, David Marchand wrote:
> > On Wed, Jul 3, 2024 at 7:13 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
> >> --- /dev/null
> >> +++ b/lib/hash/compare_signatures_arm_pvt.h
> > 
> > I guess pvt stands for private.
> > No need for such suffix, this header won't be exported in any case.
> 
> pvt do stand for private, yes. I had a look at the other lib and what 
> they used to state a header as private. Several (rcu, ring and stack) 
> use _pvt so it looks like that's might be the standard? If no, then how 
> am I supposed to differentiate a public and a private header?
Public headers are prefixed with rte_
We should not use _pvt
> >> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> >> +       case RTE_HASH_COMPARE_NEON: {
> >> +               uint16x8_t vmat, vsig, x;
> >> +               int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> >> +               uint16_t low, high;
> >> +
> >> +               vsig = vld1q_dup_u16((uint16_t const *)&sig);
> >> +               /* Compare all signatures in the primary bucket */
> >> +               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
> >> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +               low = (uint16_t)(vaddvq_u16(x));
> >> +               /* Compare all signatures in the secondary bucket */
> >> +               vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
> >> +               x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +               high = (uint16_t)(vaddvq_u16(x));
> >> +               *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> >> +
> >> +               }
> >> +               break;
> >> +#endif
> >> +       default:
> >> +               for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +                       *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
> >> +                       *hitmask_buffer |=
> >> +                               ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +               }
> >> +       }
> >> +}
> > 
> > IIRC, this code is copied in all three headers.
> > It is a common scalar version, so the ARM code could simply call the
> > "generic" implementation rather than copy/paste.
> 
> Out of the three files, only two versions are the same: generic and arm. 
> Intel's version do have some padding added (given it's sparse).
> I prefer to keep a scalar version in the arm implementation because 
> that's what match the legacy implementation. We used to be able to 
> choose (at runtime) to use the scalar path even if we had neon. In 
> practice the choice ends up being made from #defines, but as far as this 
> function goes, it is a runtime decision.
I have no strong opinion.
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
 
 
- * [PATCH v10 2/4] hash: optimize compare signature for NEON
  2024-07-03 17:13 ` [PATCH v10 " Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-07-03 17:13   ` Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-03 17:13 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking with specific masks.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index e83bae9912..1d5464c4ce 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -32,21 +32,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v10 3/4] test/hash: check bulk lookup of keys after collision
  2024-07-03 17:13 ` [PATCH v10 " Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 1/4] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 2/4] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-07-03 17:13   ` Yoan Picchi
  2024-07-03 17:13   ` [PATCH v10 4/4] hash: add SVE support for bulk key lookup Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-03 17:13 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap             |  1 +
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 77 insertions(+), 23 deletions(-)
diff --git a/.mailmap b/.mailmap
index ec525981fe..41a8a99a7c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -505,6 +505,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index 24d3b547ad..ab3b37de3f 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(¶ms_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v10 4/4] hash: add SVE support for bulk key lookup
  2024-07-03 17:13 ` [PATCH v10 " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-07-03 17:13   ` [PATCH v10 3/4] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-07-03 17:13   ` Yoan Picchi
  3 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-03 17:13 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than optimized NEON for N2 processor for
128b vectors.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h | 57 +++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c            |  7 +++-
 2 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index 1d5464c4ce..efec78afb0 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -49,6 +49,63 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred, primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+					&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+					&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 61cc12d83b..e5831ad146 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -452,8 +452,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v11 0/7] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (9 preceding siblings ...)
  2024-07-03 17:13 ` [PATCH v10 " Yoan Picchi
@ 2024-07-05 17:45 ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 1/7] hash: make compare signature function enum private Yoan Picchi
                     ` (6 more replies)
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
  11 siblings, 7 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  NEON is up to 7-10% faster (ran on Ampere Altra)
  128b SVE is about 3-5% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized NEON (ran on a Graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
V7->V8:
  Commit message
  Typos and missing spaces
V8->V9:
  Use __rte_unused instead of (void)
  Fix an indentation mistake
V9->V10:
  Fix more formatting and indentation
  Move the new compare signature file directly in hash instead of being
    in a new subdir
  Re-order includes
  Remove duplicated static check
  Move rte_hash_sig_compare_function's definition into a private header
V10->V11:
  Split the "pack the hitmask" commit into four commits:
    Move the compare function enum out of the ABI
    Move the compare function implementations into arch-specific files
    Add a missing check on RTE_HASH_BUCKET_ENTRIES in case we change it
      in the future
    Implement the dense hitmask
  Add missing header guards
  Move compare function enum into cuckoo_hash.c instead of its own header.
Yoan Picchi (7):
  hash: make compare signature function enum private
  hash: split compare signature into arch-specific files
  hash: add a check on hash entry max size
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                                  |   2 +
 app/test/test_hash.c                      |  99 ++++++++---
 lib/hash/compare_signatures_arm_pvt.h     | 121 +++++++++++++
 lib/hash/compare_signatures_generic_pvt.h |  40 +++++
 lib/hash/compare_signatures_x86_pvt.h     |  55 ++++++
 lib/hash/rte_cuckoo_hash.c                | 207 ++++++++++++----------
 lib/hash/rte_cuckoo_hash.h                |  10 +-
 7 files changed, 410 insertions(+), 124 deletions(-)
 create mode 100644 lib/hash/compare_signatures_arm_pvt.h
 create mode 100644 lib/hash/compare_signatures_generic_pvt.h
 create mode 100644 lib/hash/compare_signatures_x86_pvt.h
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 1/7] hash: make compare signature function enum private
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 2/7] hash: split compare signature into arch-specific files Yoan Picchi
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi
enum rte_hash_sig_compare_function is only used internally. This
patch moves it out of the public ABI and into the C file.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 10 ++++++++++
 lib/hash/rte_cuckoo_hash.h | 10 +---------
 2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index d87aa52b5b..e1d50e7d40 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,16 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+/* Enum used to select the implementation of the signature comparison function to use
+ * eg: A system supporting SVE might want to use a NEON or scalar implementation.
+ */
+enum rte_hash_sig_compare_function {
+	RTE_HASH_COMPARE_SCALAR = 0,
+	RTE_HASH_COMPARE_SSE,
+	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_NUM
+};
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..26a992419a 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -134,14 +134,6 @@ struct rte_hash_key {
 	char key[0];
 };
 
-/* All different signature compare functions */
-enum rte_hash_sig_compare_function {
-	RTE_HASH_COMPARE_SCALAR = 0,
-	RTE_HASH_COMPARE_SSE,
-	RTE_HASH_COMPARE_NEON,
-	RTE_HASH_COMPARE_NUM
-};
-
 /** Bucket structure */
 struct __rte_cache_aligned rte_hash_bucket {
 	uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES];
@@ -199,7 +191,7 @@ struct __rte_cache_aligned rte_hash {
 	/**< Custom function used to compare keys. */
 	enum cmp_jump_table_case cmp_jump_table_idx;
 	/**< Indicates which compare function to use. */
-	enum rte_hash_sig_compare_function sig_cmp_fn;
+	unsigned int sig_cmp_fn;
 	/**< Indicates which signature compare function to use. */
 	uint32_t bucket_bitmask;
 	/**< Bitmask for getting bucket index from hash signature. */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 2/7] hash: split compare signature into arch-specific files
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 1/7] hash: make compare signature function enum private Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 3/7] hash: add a check on hash entry max size Yoan Picchi
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi
Move the compare_signatures function into architecture-specific files.
They all keep the default scalar implementation as a fallback if we disable
vectorisation.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 .mailmap                                  |  1 +
 lib/hash/compare_signatures_arm_pvt.h     | 55 +++++++++++++++++++
 lib/hash/compare_signatures_generic_pvt.h | 33 ++++++++++++
 lib/hash/compare_signatures_x86_pvt.h     | 48 +++++++++++++++++
 lib/hash/rte_cuckoo_hash.c                | 65 +++--------------------
 5 files changed, 145 insertions(+), 57 deletions(-)
 create mode 100644 lib/hash/compare_signatures_arm_pvt.h
 create mode 100644 lib/hash/compare_signatures_generic_pvt.h
 create mode 100644 lib/hash/compare_signatures_x86_pvt.h
diff --git a/.mailmap b/.mailmap
index f76037213d..ec525981fe 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1661,6 +1661,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
new file mode 100644
index 0000000000..80b6afb7a5
--- /dev/null
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+#ifndef _COMPARE_SIGNATURE_ARM_PVT_H_
+#define _COMPARE_SIGNATURE_ARM_PVT_H_
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+
+static inline void
+compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__ARM_NEON)
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+#endif
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
+
+#endif
diff --git a/lib/hash/compare_signatures_generic_pvt.h b/lib/hash/compare_signatures_generic_pvt.h
new file mode 100644
index 0000000000..43587adcef
--- /dev/null
+++ b/lib/hash/compare_signatures_generic_pvt.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+#ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_
+#define _COMPARE_SIGNATURE_GENERIC_PVT_H_
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+
+static inline void
+compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask the first bit of every two bits indicates the match */
+	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*prim_hash_matches |=
+			((sig == prim_bkt->sig_current[i]) << (i << 1));
+		*sec_hash_matches |=
+			((sig == sec_bkt->sig_current[i]) << (i << 1));
+	}
+}
+
+#endif
diff --git a/lib/hash/compare_signatures_x86_pvt.h b/lib/hash/compare_signatures_x86_pvt.h
new file mode 100644
index 0000000000..11a82aced9
--- /dev/null
+++ b/lib/hash/compare_signatures_x86_pvt.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+#ifndef _COMPARE_SIGNATURE_X86_PVT_H_
+#define _COMPARE_SIGNATURE_X86_PVT_H_
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+
+static inline void
+compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__)
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
+			(__m128i const *)prim_bkt->sig_current), _mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
+			(__m128i const *)sec_bkt->sig_current), _mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |= (sig == prim_bkt->sig_current[i]) << (i << 1);
+			*sec_hash_matches |= (sig == sec_bkt->sig_current[i]) << (i << 1);
+		}
+	}
+}
+
+#endif
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index e1d50e7d40..739f7927b8 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -43,6 +43,14 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_NUM
 };
 
+#if defined(__ARM_NEON)
+#include "compare_signatures_arm_pvt.h"
+#elif defined(__SSE2__)
+#include "compare_signatures_x86_pvt.h"
+#else
+#include "compare_signatures_generic_pvt.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1890,63 +1898,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 3/7] hash: add a check on hash entry max size
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 1/7] hash: make compare signature function enum private Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 2/7] hash: split compare signature into arch-specific files Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 4/7] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi
If we were to change RTE_HASH_BUCKET_ENTRIES to be over 8, a bucket's
signatures would no longer fit in the vector (8*16b=128b), so the vector
code would fail to check some of them. This patch adds a compile-time check
to fall back to scalar code in this case.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h | 2 +-
 lib/hash/compare_signatures_x86_pvt.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index 80b6afb7a5..74b3286c95 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -23,7 +23,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
 		uint16x8_t vmat, vsig, x;
 		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
diff --git a/lib/hash/compare_signatures_x86_pvt.h b/lib/hash/compare_signatures_x86_pvt.h
index 11a82aced9..f77b37f1cd 100644
--- a/lib/hash/compare_signatures_x86_pvt.h
+++ b/lib/hash/compare_signatures_x86_pvt.h
@@ -23,7 +23,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 4/7] hash: pack the hitmask for hash in bulk lookup
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-07-05 17:45   ` [PATCH v11 3/7] hash: add a check on hash entry max size Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 5/7] hash: optimize compare signature for NEON Yoan Picchi
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
The current hitmask includes padding due to an implementation detail of
Intel's SIMD code. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweaves the primary
and secondary matches, which allows better cache usage and
enables future improvements for the SIMD implementations.
The default non-SIMD path now uses this dense mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h     |  47 ++++----
 lib/hash/compare_signatures_generic_pvt.h |  31 +++---
 lib/hash/compare_signatures_x86_pvt.h     |   9 +-
 lib/hash/rte_cuckoo_hash.c                | 124 ++++++++++++++++------
 4 files changed, 145 insertions(+), 66 deletions(-)
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index 74b3286c95..0fc657c49b 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -6,48 +6,57 @@
 #ifndef _COMPARE_SIGNATURE_ARM_PVT_H_
 #define _COMPARE_SIGNATURE_ARM_PVT_H_
 
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-	unsigned int i;
 
-	/* For match mask the first bit of every two bits indicates the match */
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+		"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
 		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
 
 		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
 		}
 		break;
 #endif
 	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
diff --git a/lib/hash/compare_signatures_generic_pvt.h b/lib/hash/compare_signatures_generic_pvt.h
index 43587adcef..1d065d4c28 100644
--- a/lib/hash/compare_signatures_generic_pvt.h
+++ b/lib/hash/compare_signatures_generic_pvt.h
@@ -6,27 +6,34 @@
 #ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_
 #define _COMPARE_SIGNATURE_GENERIC_PVT_H_
 
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
+			__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-		*prim_hash_matches |=
-			((sig == prim_bkt->sig_current[i]) << (i << 1));
-		*sec_hash_matches |=
-			((sig == sec_bkt->sig_current[i]) << (i << 1));
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+			"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+		*hitmask_buffer |= ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 	}
 }
 
diff --git a/lib/hash/compare_signatures_x86_pvt.h b/lib/hash/compare_signatures_x86_pvt.h
index f77b37f1cd..03e9c44e53 100644
--- a/lib/hash/compare_signatures_x86_pvt.h
+++ b/lib/hash/compare_signatures_x86_pvt.h
@@ -6,14 +6,21 @@
 #ifndef _COMPARE_SIGNATURE_X86_PVT_H_
 #define _COMPARE_SIGNATURE_X86_PVT_H_
 
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 0
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 739f7927b8..187918a05a 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1908,22 +1908,41 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1934,10 +1953,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1951,10 +1970,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1976,13 +2003,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2005,7 +2032,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2055,11 +2082,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2073,14 +2109,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2091,10 +2139,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2155,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2136,13 +2192,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2169,7 +2225,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 5/7] hash: optimize compare signature for NEON
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
                     ` (3 preceding siblings ...)
  2024-07-05 17:45   ` [PATCH v11 4/7] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 6/7] test/hash: check bulk lookup of keys after collision Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 7/7] hash: add SVE support for bulk key lookup Yoan Picchi
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking each lane with its own bit mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index 0fc657c49b..0245fec26f 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -34,21 +34,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 6/7] test/hash: check bulk lookup of keys after collision
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
                     ` (4 preceding siblings ...)
  2024-07-05 17:45   ` [PATCH v11 5/7] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  2024-07-05 17:45   ` [PATCH v11 7/7] hash: add SVE support for bulk key lookup Yoan Picchi
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to the current number of entries
in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap             |  1 +
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 77 insertions(+), 23 deletions(-)
diff --git a/.mailmap b/.mailmap
index ec525981fe..41a8a99a7c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -505,6 +505,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index 24d3b547ad..ab3b37de3f 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(&params_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v11 7/7] hash: add SVE support for bulk key lookup
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
                     ` (5 preceding siblings ...)
  2024-07-05 17:45   ` [PATCH v11 6/7] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-07-05 17:45   ` Yoan Picchi
  6 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-05 17:45 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than optimized NEON for N2 processor for
128b vectors.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/compare_signatures_arm_pvt.h | 57 +++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c            |  8 +++-
 2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/lib/hash/compare_signatures_arm_pvt.h b/lib/hash/compare_signatures_arm_pvt.h
index 0245fec26f..86843b8a8a 100644
--- a/lib/hash/compare_signatures_arm_pvt.h
+++ b/lib/hash/compare_signatures_arm_pvt.h
@@ -51,6 +51,63 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred, primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+					&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+					&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 187918a05a..c30ea13000 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -40,6 +40,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
@@ -461,8 +462,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
-- 
2.34.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
 
- * [PATCH v12 0/7] hash: add SVE support for bulk key lookup
  2023-10-20 16:51 [PATCH v2 0/4] hash: add SVE support for bulk key lookup Yoan Picchi
                   ` (10 preceding siblings ...)
  2024-07-05 17:45 ` [PATCH v11 0/7] " Yoan Picchi
@ 2024-07-08 12:14 ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 1/7] hash: make compare signature function enum private Yoan Picchi
                     ` (7 more replies)
  11 siblings, 8 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  Cc: dev, nd, Yoan Picchi
This patchset adds SVE support for the signature comparison in the cuckoo
hash lookup and improves the existing NEON implementation. These
optimizations required changes to the data format and signature of the
relevant functions to support dense hitmasks (no padding) and having the
primary and secondary hitmasks interleaved instead of being in their own
array each.
Benchmarking the cuckoo hash perf test, I observed this effect on speed:
  There are no significant changes on Intel (ran on Sapphire Rapids)
  Neon is up to 7-10% faster (ran on ampere altra)
  128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
    3 cloud instance)
  256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
    3 cloud instance)
V2->V3:
  Remove a redundant if in the test
  Change a couple int to uint16_t in compare_signatures_dense
  Several coding-style fixes
V3->V4:
  Rebase
V4->V5:
  Commit message
V5->V6:
  Move the arch-specific code into new arch-specific files
  Isolate the data structure refactor from adding SVE
V6->V7:
  Commit message
  Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
V7->V8:
  Commit message
  Typos and missing spaces
V8->V9:
  Use __rte_unused instead of (void)
  Fix an indentation mistake
V9->V10:
  Fix more formatting and indentation
  Move the new compare signature file directly in hash instead of being
    in a new subdir
  Re-order includes
  Remove duplicated static check
  Move rte_hash_sig_compare_function's definition into a private header
V10->V11:
  Split the "pack the hitmask" commit into four commits:
    Move the compare function enum out of the ABI
    Move the compare function implementations into arch-specific files
    Add a missing check on RTE_HASH_BUCKET_ENTRIES in case we change it
      in the future
    Implement the dense hitmask
  Add missing header guards
  Move compare function enum into cuckoo_hash.c instead of its own header.
V11->V12:
  Change the name of the compare function file (remove the _pvt suffix)
Yoan Picchi (7):
  hash: make compare signature function enum private
  hash: split compare signature into arch-specific files
  hash: add a check on hash entry max size
  hash: pack the hitmask for hash in bulk lookup
  hash: optimize compare signature for NEON
  test/hash: check bulk lookup of keys after collision
  hash: add SVE support for bulk key lookup
 .mailmap                              |   2 +
 app/test/test_hash.c                  |  99 +++++++++---
 lib/hash/compare_signatures_arm.h     | 121 +++++++++++++++
 lib/hash/compare_signatures_generic.h |  40 +++++
 lib/hash/compare_signatures_x86.h     |  55 +++++++
 lib/hash/rte_cuckoo_hash.c            | 207 ++++++++++++++------------
 lib/hash/rte_cuckoo_hash.h            |  10 +-
 7 files changed, 410 insertions(+), 124 deletions(-)
 create mode 100644 lib/hash/compare_signatures_arm.h
 create mode 100644 lib/hash/compare_signatures_generic.h
 create mode 100644 lib/hash/compare_signatures_x86.h
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 1/7] hash: make compare signature function enum private
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 2/7] hash: split compare signature into arch-specific files Yoan Picchi
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi
enum rte_hash_sig_compare_function is only used internally. This
patch moves it out of the public ABI and into the C file.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 lib/hash/rte_cuckoo_hash.c | 10 ++++++++++
 lib/hash/rte_cuckoo_hash.h | 10 +---------
 2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index d87aa52b5b..e1d50e7d40 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,16 @@ RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+/* Enum used to select the implementation of the signature comparison function to use
+ * eg: A system supporting SVE might want to use a NEON or scalar implementation.
+ */
+enum rte_hash_sig_compare_function {
+	RTE_HASH_COMPARE_SCALAR = 0,
+	RTE_HASH_COMPARE_SSE,
+	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_NUM
+};
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index a528f1d1a0..26a992419a 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -134,14 +134,6 @@ struct rte_hash_key {
 	char key[0];
 };
 
-/* All different signature compare functions */
-enum rte_hash_sig_compare_function {
-	RTE_HASH_COMPARE_SCALAR = 0,
-	RTE_HASH_COMPARE_SSE,
-	RTE_HASH_COMPARE_NEON,
-	RTE_HASH_COMPARE_NUM
-};
-
 /** Bucket structure */
 struct __rte_cache_aligned rte_hash_bucket {
 	uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES];
@@ -199,7 +191,7 @@ struct __rte_cache_aligned rte_hash {
 	/**< Custom function used to compare keys. */
 	enum cmp_jump_table_case cmp_jump_table_idx;
 	/**< Indicates which compare function to use. */
-	enum rte_hash_sig_compare_function sig_cmp_fn;
+	unsigned int sig_cmp_fn;
 	/**< Indicates which signature compare function to use. */
 	uint32_t bucket_bitmask;
 	/**< Bitmask for getting bucket index from hash signature. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 2/7] hash: split compare signature into arch-specific files
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 1/7] hash: make compare signature function enum private Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 3/7] hash: add a check on hash entry max size Yoan Picchi
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi
Move the compare_signatures function into architecture-specific files.
They all keep the default scalar implementation as a fallback in case
vectorisation is disabled.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 .mailmap                              |  1 +
 lib/hash/compare_signatures_arm.h     | 55 +++++++++++++++++++++++
 lib/hash/compare_signatures_generic.h | 33 ++++++++++++++
 lib/hash/compare_signatures_x86.h     | 48 ++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c            | 65 ++++-----------------------
 5 files changed, 145 insertions(+), 57 deletions(-)
 create mode 100644 lib/hash/compare_signatures_arm.h
 create mode 100644 lib/hash/compare_signatures_generic.h
 create mode 100644 lib/hash/compare_signatures_x86.h
diff --git a/.mailmap b/.mailmap
index f76037213d..ec525981fe 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1661,6 +1661,7 @@ Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/compare_signatures_arm.h b/lib/hash/compare_signatures_arm.h
new file mode 100644
index 0000000000..80b6afb7a5
--- /dev/null
+++ b/lib/hash/compare_signatures_arm.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+#ifndef _COMPARE_SIGNATURE_ARM_PVT_H_
+#define _COMPARE_SIGNATURE_ARM_PVT_H_
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+
+static inline void
+compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__ARM_NEON)
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
+		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
+		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		}
+		break;
+#endif
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
+
+#endif
diff --git a/lib/hash/compare_signatures_generic.h b/lib/hash/compare_signatures_generic.h
new file mode 100644
index 0000000000..43587adcef
--- /dev/null
+++ b/lib/hash/compare_signatures_generic.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+#ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_
+#define _COMPARE_SIGNATURE_GENERIC_PVT_H_
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+
+static inline void
+compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask the first bit of every two bits indicates the match */
+	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*prim_hash_matches |=
+			((sig == prim_bkt->sig_current[i]) << (i << 1));
+		*sec_hash_matches |=
+			((sig == sec_bkt->sig_current[i]) << (i << 1));
+	}
+}
+
+#endif
diff --git a/lib/hash/compare_signatures_x86.h b/lib/hash/compare_signatures_x86.h
new file mode 100644
index 0000000000..11a82aced9
--- /dev/null
+++ b/lib/hash/compare_signatures_x86.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+#ifndef _COMPARE_SIGNATURE_X86_PVT_H_
+#define _COMPARE_SIGNATURE_X86_PVT_H_
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#include "rte_cuckoo_hash.h"
+
+static inline void
+compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	unsigned int i;
+
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__)
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
+			(__m128i const *)prim_bkt->sig_current), _mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
+			(__m128i const *)sec_bkt->sig_current), _mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif
+	default:
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |= (sig == prim_bkt->sig_current[i]) << (i << 1);
+			*sec_hash_matches |= (sig == sec_bkt->sig_current[i]) << (i << 1);
+		}
+	}
+}
+
+#endif
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index e1d50e7d40..c3256dff4c 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -43,6 +43,14 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_NUM
 };
 
+#if defined(__ARM_NEON)
+#include "compare_signatures_arm.h"
+#elif defined(__SSE2__)
+#include "compare_signatures_x86.h"
+#else
+#include "compare_signatures_generic.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1890,63 +1898,6 @@ rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 3/7] hash: add a check on hash entry max size
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 1/7] hash: make compare signature function enum private Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 2/7] hash: split compare signature into arch-specific files Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 4/7] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi
If we were to change RTE_HASH_BUCKET_ENTRIES to be over 8, the signatures
would no longer fit in the vector (8*16b=128b), and some of them would not
be checked. This patch adds a compile-time check to fall back to scalar
code in this case.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
---
 lib/hash/compare_signatures_arm.h | 2 +-
 lib/hash/compare_signatures_x86.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/hash/compare_signatures_arm.h b/lib/hash/compare_signatures_arm.h
index 80b6afb7a5..74b3286c95 100644
--- a/lib/hash/compare_signatures_arm.h
+++ b/lib/hash/compare_signatures_arm.h
@@ -23,7 +23,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
 		uint16x8_t vmat, vsig, x;
 		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
diff --git a/lib/hash/compare_signatures_x86.h b/lib/hash/compare_signatures_x86.h
index 11a82aced9..f77b37f1cd 100644
--- a/lib/hash/compare_signatures_x86.h
+++ b/lib/hash/compare_signatures_x86.h
@@ -23,7 +23,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 
 	/* For match mask the first bit of every two bits indicates the match */
 	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_SSE:
 		/* Compare all signatures in the bucket */
 		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(_mm_load_si128(
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 4/7] hash: pack the hitmask for hash in bulk lookup
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
                     ` (2 preceding siblings ...)
  2024-07-08 12:14   ` [PATCH v12 3/7] hash: add a check on hash entry max size Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 5/7] hash: optimize compare signature for NEON Yoan Picchi
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
The current hitmask includes padding due to a detail of Intel's
SIMD implementation. This patch allows non-Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interleaves the primary
and secondary matches, which allows better cache usage and
enables future improvements to the SIMD implementations.
The default non-SIMD path now uses this dense mask.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/compare_signatures_arm.h     |  47 ++++++----
 lib/hash/compare_signatures_generic.h |  31 ++++---
 lib/hash/compare_signatures_x86.h     |   9 +-
 lib/hash/rte_cuckoo_hash.c            | 124 +++++++++++++++++++-------
 4 files changed, 145 insertions(+), 66 deletions(-)
diff --git a/lib/hash/compare_signatures_arm.h b/lib/hash/compare_signatures_arm.h
index 74b3286c95..0fc657c49b 100644
--- a/lib/hash/compare_signatures_arm.h
+++ b/lib/hash/compare_signatures_arm.h
@@ -6,48 +6,57 @@
 #ifndef _COMPARE_SIGNATURE_ARM_PVT_H_
 #define _COMPARE_SIGNATURE_ARM_PVT_H_
 
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
 			enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-	unsigned int i;
 
-	/* For match mask the first bit of every two bits indicates the match */
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+		"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
 	switch (sig_cmp_fn) {
 #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
 		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
 
 		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
 		}
 		break;
 #endif
 	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 		}
 	}
 }
diff --git a/lib/hash/compare_signatures_generic.h b/lib/hash/compare_signatures_generic.h
index 43587adcef..1d065d4c28 100644
--- a/lib/hash/compare_signatures_generic.h
+++ b/lib/hash/compare_signatures_generic.h
@@ -6,27 +6,34 @@
 #ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_
 #define _COMPARE_SIGNATURE_GENERIC_PVT_H_
 
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 1
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
 			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
+			__rte_unused enum rte_hash_sig_compare_function sig_cmp_fn)
 {
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-		*prim_hash_matches |=
-			((sig == prim_bkt->sig_current[i]) << (i << 1));
-		*sec_hash_matches |=
-			((sig == sec_bkt->sig_current[i]) << (i << 1));
+
+	static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8),
+			"hitmask_buffer must be wide enough to fit a dense hitmask");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i;
+		*hitmask_buffer |= ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
 	}
 }
 
diff --git a/lib/hash/compare_signatures_x86.h b/lib/hash/compare_signatures_x86.h
index f77b37f1cd..03e9c44e53 100644
--- a/lib/hash/compare_signatures_x86.h
+++ b/lib/hash/compare_signatures_x86.h
@@ -6,14 +6,21 @@
 #ifndef _COMPARE_SIGNATURE_X86_PVT_H_
 #define _COMPARE_SIGNATURE_X86_PVT_H_
 
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
 #include <inttypes.h>
 #include <rte_common.h>
 #include <rte_vect.h>
 
 #include "rte_cuckoo_hash.h"
 
+#define DENSE_HASH_BULK_LOOKUP 0
+
 static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
 			const struct rte_hash_bucket *prim_bkt,
 			const struct rte_hash_bucket *sec_bkt,
 			uint16_t sig,
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index c3256dff4c..7512861aac 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -1908,22 +1908,41 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1934,10 +1953,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1951,10 +1970,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1976,13 +2003,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2005,7 +2032,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2055,11 +2082,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2073,14 +2109,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2091,10 +2139,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2155,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2136,13 +2192,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2169,7 +2225,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 5/7] hash: optimize compare signature for NEON
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
                     ` (3 preceding siblings ...)
  2024-07-08 12:14   ` [PATCH v12 4/7] hash: pack the hitmask for hash in bulk lookup Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 6/7] test/hash: check bulk lookup of keys after collision Yoan Picchi
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Ruifeng Wang, Nathan Brown
Upon a successful comparison, NEON sets all the bits in the lane to 1.
We can skip shifting by simply masking with specific masks.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 lib/hash/compare_signatures_arm.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/lib/hash/compare_signatures_arm.h b/lib/hash/compare_signatures_arm.h
index 0fc657c49b..0245fec26f 100644
--- a/lib/hash/compare_signatures_arm.h
+++ b/lib/hash/compare_signatures_arm.h
@@ -34,21 +34,21 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 	switch (sig_cmp_fn) {
 #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8
 	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-		uint16_t low, high;
+		uint16x8_t vmat, hit1, hit2;
+		const uint16x8_t mask = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+		const uint16x8_t vsig = vld1q_dup_u16((uint16_t const *)&sig);
 
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
 		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		low = (uint16_t)(vaddvq_u16(x));
+		vmat = vceqq_u16(vsig, vld1q_u16(prim_bucket_sigs));
+		hit1 = vandq_u16(vmat, mask);
+
 		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
-		high = (uint16_t)(vaddvq_u16(x));
-		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+		vmat = vceqq_u16(vsig, vld1q_u16(sec_bucket_sigs));
+		hit2 = vandq_u16(vmat, mask);
 
+		hit2 = vshlq_n_u16(hit2, RTE_HASH_BUCKET_ENTRIES);
+		hit2 = vorrq_u16(hit1, hit2);
+		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
 #endif
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 6/7] test/hash: check bulk lookup of keys after collision
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
                     ` (4 preceding siblings ...)
  2024-07-08 12:14   ` [PATCH v12 5/7] hash: optimize compare signature for NEON Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-08 12:14   ` [PATCH v12 7/7] hash: add SVE support for bulk key lookup Yoan Picchi
  2024-07-09  4:48   ` [PATCH v12 0/7] " David Marchand
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Thomas Monjalon, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Ruifeng Wang, Nathan Brown
This patch adds a unit test for rte_hash_lookup_bulk().
It also updates the test_full_bucket test to match the current number of
entries in a hash bucket.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap             |  1 +
 app/test/test_hash.c | 99 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 77 insertions(+), 23 deletions(-)
diff --git a/.mailmap b/.mailmap
index ec525981fe..41a8a99a7c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -505,6 +505,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
diff --git a/app/test/test_hash.c b/app/test/test_hash.c
index 24d3b547ad..ab3b37de3f 100644
--- a/app/test/test_hash.c
+++ b/app/test/test_hash.c
@@ -95,7 +95,7 @@ static uint32_t pseudo_hash(__rte_unused const void *keys,
 			    __rte_unused uint32_t key_len,
 			    __rte_unused uint32_t init_val)
 {
-	return 3;
+	return 3 | (3 << 16);
 }
 
 RTE_LOG_REGISTER(hash_logtype_test, test.hash, INFO);
@@ -115,8 +115,10 @@ static void print_key_info(const char *msg, const struct flow_key *key,
 	rte_log(RTE_LOG_DEBUG, hash_logtype_test, " @ pos %d\n", pos);
 }
 
+#define KEY_PER_BUCKET 8
+
 /* Keys used by unit test functions */
-static struct flow_key keys[5] = { {
+static struct flow_key keys[KEY_PER_BUCKET+1] = { {
 	.ip_src = RTE_IPV4(0x03, 0x02, 0x01, 0x00),
 	.ip_dst = RTE_IPV4(0x07, 0x06, 0x05, 0x04),
 	.port_src = 0x0908,
@@ -146,6 +148,30 @@ static struct flow_key keys[5] = { {
 	.port_src = 0x4948,
 	.port_dst = 0x4b4a,
 	.proto = 0x4c,
+}, {
+	.ip_src = RTE_IPV4(0x53, 0x52, 0x51, 0x50),
+	.ip_dst = RTE_IPV4(0x57, 0x56, 0x55, 0x54),
+	.port_src = 0x5958,
+	.port_dst = 0x5b5a,
+	.proto = 0x5c,
+}, {
+	.ip_src = RTE_IPV4(0x63, 0x62, 0x61, 0x60),
+	.ip_dst = RTE_IPV4(0x67, 0x66, 0x65, 0x64),
+	.port_src = 0x6968,
+	.port_dst = 0x6b6a,
+	.proto = 0x6c,
+}, {
+	.ip_src = RTE_IPV4(0x73, 0x72, 0x71, 0x70),
+	.ip_dst = RTE_IPV4(0x77, 0x76, 0x75, 0x74),
+	.port_src = 0x7978,
+	.port_dst = 0x7b7a,
+	.proto = 0x7c,
+}, {
+	.ip_src = RTE_IPV4(0x83, 0x82, 0x81, 0x80),
+	.ip_dst = RTE_IPV4(0x87, 0x86, 0x85, 0x84),
+	.port_src = 0x8988,
+	.port_dst = 0x8b8a,
+	.proto = 0x8c,
 } };
 
 /* Parameters used for hash table in unit test functions. Name set later. */
@@ -783,13 +809,15 @@ static int test_five_keys(void)
 
 /*
  * Add keys to the same bucket until bucket full.
- *	- add 5 keys to the same bucket (hash created with 4 keys per bucket):
- *	  first 4 successful, 5th successful, pushing existing item in bucket
- *	- lookup the 5 keys: 5 hits
- *	- add the 5 keys again: 5 OK
- *	- lookup the 5 keys: 5 hits (updated data)
- *	- delete the 5 keys: 5 OK
- *	- lookup the 5 keys: 5 misses
+ *	- add 9 keys to the same bucket (hash created with 8 keys per bucket):
+ *	  first 8 successful, 9th successful, pushing existing item in bucket
+ *	- lookup the 9 keys: 9 hits
+ *	- bulk lookup for all the 9 keys: 9 hits
+ *	- add the 9 keys again: 9 OK
+ *	- lookup the 9 keys: 9 hits (updated data)
+ *	- delete the 9 keys: 9 OK
+ *	- lookup the 9 keys: 9 misses
+ *	- bulk lookup for all the 9 keys: 9 misses
  */
 static int test_full_bucket(void)
 {
@@ -801,16 +829,17 @@ static int test_full_bucket(void)
 		.hash_func_init_val = 0,
 		.socket_id = 0,
 	};
+	const void *key_array[KEY_PER_BUCKET+1] = {0};
 	struct rte_hash *handle;
-	int pos[5];
-	int expected_pos[5];
+	int pos[KEY_PER_BUCKET+1];
+	int expected_pos[KEY_PER_BUCKET+1];
 	unsigned i;
-
+	int ret;
 	handle = rte_hash_create(¶ms_pseudo_hash);
 	RETURN_IF_ERROR(handle == NULL, "hash creation failed");
 
 	/* Fill bucket */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < KEY_PER_BUCKET; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] < 0,
@@ -821,22 +850,36 @@ static int test_full_bucket(void)
 	 * This should work and will push one of the items
 	 * in the bucket because it is full
 	 */
-	pos[4] = rte_hash_add_key(handle, &keys[4]);
-	print_key_info("Add", &keys[4], pos[4]);
-	RETURN_IF_ERROR(pos[4] < 0,
-			"failed to add key (pos[4]=%d)", pos[4]);
-	expected_pos[4] = pos[4];
+	pos[KEY_PER_BUCKET] = rte_hash_add_key(handle, &keys[KEY_PER_BUCKET]);
+	print_key_info("Add", &keys[KEY_PER_BUCKET], pos[KEY_PER_BUCKET]);
+	RETURN_IF_ERROR(pos[KEY_PER_BUCKET] < 0,
+			"failed to add key (pos[%d]=%d)", KEY_PER_BUCKET, pos[KEY_PER_BUCKET]);
+	expected_pos[KEY_PER_BUCKET] = pos[KEY_PER_BUCKET];
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
 			"failed to find key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	for (i = 0; i < KEY_PER_BUCKET+1; i++)
+		key_array[i] = &keys[i];
+
+	/*Bulk lookup after add with same hash*/
+	ret = rte_hash_lookup_bulk(handle, key_array, KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != expected_pos[i],
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
+
 	/* Add - update */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_add_key(handle, &keys[i]);
 		print_key_info("Add", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -844,7 +887,7 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -869,7 +912,7 @@ static int test_full_bucket(void)
 	RETURN_IF_ERROR(pos[1] < 0, "failed to add key (pos[1]=%d)", pos[1]);
 
 	/* Delete */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_del_key(handle, &keys[i]);
 		print_key_info("Del", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != expected_pos[i],
@@ -877,13 +920,23 @@ static int test_full_bucket(void)
 	}
 
 	/* Lookup */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
 		pos[i] = rte_hash_lookup(handle, &keys[i]);
 		print_key_info("Lkp", &keys[i], pos[i]);
 		RETURN_IF_ERROR(pos[i] != -ENOENT,
 			"fail: found non-existent key (pos[%u]=%d)", i, pos[i]);
 	}
 
+	/* Bulk Lookup on empty table*/
+	ret = rte_hash_lookup_bulk(handle, &key_array[0], KEY_PER_BUCKET+1, (int32_t *)pos);
+	RETURN_IF_ERROR(ret, "rte_hash_lookup_bulk returned an error: %d\n", ret);
+	for (i = 0; i < KEY_PER_BUCKET+1; i++) {
+		print_key_info("Blk_Lkp", key_array[i], pos[i]);
+		RETURN_IF_ERROR(pos[i] != -ENOENT,
+				"failed to find key (pos[%u]=%d)", i, pos[i]);
+	}
+
+
 	rte_hash_free(handle);
 
 	/* Cover the NULL case. */
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * [PATCH v12 7/7] hash: add SVE support for bulk key lookup
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
                     ` (5 preceding siblings ...)
  2024-07-08 12:14   ` [PATCH v12 6/7] test/hash: check bulk lookup of keys after collision Yoan Picchi
@ 2024-07-08 12:14   ` Yoan Picchi
  2024-07-09  4:48   ` [PATCH v12 0/7] " David Marchand
  7 siblings, 0 replies; 73+ messages in thread
From: Yoan Picchi @ 2024-07-08 12:14 UTC (permalink / raw)
  To: Yipeng Wang, Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin
  Cc: dev, nd, Yoan Picchi, Harjot Singh, Nathan Brown, Ruifeng Wang
- Implemented SVE code for comparing signatures in bulk lookup.
- New SVE code is ~5% slower than the optimized NEON code on the N2
processor with 128b vectors.
Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Signed-off-by: Harjot Singh <harjot.singh@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/hash/compare_signatures_arm.h | 57 +++++++++++++++++++++++++++++++
 lib/hash/rte_cuckoo_hash.c        |  8 ++++-
 2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/lib/hash/compare_signatures_arm.h b/lib/hash/compare_signatures_arm.h
index 0245fec26f..86843b8a8a 100644
--- a/lib/hash/compare_signatures_arm.h
+++ b/lib/hash/compare_signatures_arm.h
@@ -51,6 +51,63 @@ compare_signatures_dense(uint16_t *hitmask_buffer,
 		*hitmask_buffer = vaddvq_u16(hit2);
 		}
 		break;
+#endif
+#if defined(RTE_HAS_SVE_ACLE)
+	case RTE_HASH_COMPARE_SVE: {
+		svuint16_t vsign, shift, sv_matches;
+		svbool_t pred, match, bucket_wide_pred;
+		int i = 0;
+		uint64_t vl = svcnth();
+
+		vsign = svdup_u16(sig);
+		shift = svindex_u16(0, 1);
+
+		if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) {
+			svuint16_t primary_array_vect, secondary_array_vect;
+			bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES);
+			primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs);
+			secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs);
+
+			/* We merged the two vectors so we can do both comparisons at once */
+			primary_array_vect = svsplice_u16(bucket_wide_pred, primary_array_vect,
+				secondary_array_vect);
+			pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES);
+
+			/* Compare all signatures in the buckets */
+			match = svcmpeq_u16(pred, vsign, primary_array_vect);
+			if (svptest_any(svptrue_b16(), match)) {
+				sv_matches = svdup_u16(1);
+				sv_matches = svlsl_u16_z(match, sv_matches, shift);
+				*hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches);
+			}
+		} else {
+			do {
+				pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES);
+				uint16_t lower_half = 0;
+				uint16_t upper_half = 0;
+				/* Compare all signatures in the primary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+					&prim_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					lower_half = svorv_u16(svptrue_b16(), sv_matches);
+				}
+				/* Compare all signatures in the secondary bucket */
+				match = svcmpeq_u16(pred, vsign, svld1_u16(pred,
+					&sec_bucket_sigs[i]));
+				if (svptest_any(svptrue_b16(), match)) {
+					sv_matches = svdup_u16(1);
+					sv_matches = svlsl_u16_z(match, sv_matches, shift);
+					upper_half = svorv_u16(svptrue_b16(), sv_matches)
+						<< RTE_HASH_BUCKET_ENTRIES;
+				}
+				hitmask_buffer[i / 8] = upper_half | lower_half;
+				i += vl;
+			} while (i < RTE_HASH_BUCKET_ENTRIES);
+		}
+		}
+		break;
 #endif
 	default:
 		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 7512861aac..ba4093a887 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -40,6 +40,7 @@ enum rte_hash_sig_compare_function {
 	RTE_HASH_COMPARE_SCALAR = 0,
 	RTE_HASH_COMPARE_SSE,
 	RTE_HASH_COMPARE_NEON,
+	RTE_HASH_COMPARE_SVE,
 	RTE_HASH_COMPARE_NUM
 };
 
@@ -461,8 +462,13 @@ rte_hash_create(const struct rte_hash_parameters *params)
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SSE;
 	else
 #elif defined(RTE_ARCH_ARM64)
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
 		h->sig_cmp_fn = RTE_HASH_COMPARE_NEON;
+#if defined(RTE_HAS_SVE_ACLE)
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE))
+			h->sig_cmp_fn = RTE_HASH_COMPARE_SVE;
+#endif
+	}
 	else
 #endif
 		h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR;
-- 
2.25.1
^ permalink raw reply	[flat|nested] 73+ messages in thread
- * Re: [PATCH v12 0/7] hash: add SVE support for bulk key lookup
  2024-07-08 12:14 ` [PATCH v12 0/7] " Yoan Picchi
                     ` (6 preceding siblings ...)
  2024-07-08 12:14   ` [PATCH v12 7/7] hash: add SVE support for bulk key lookup Yoan Picchi
@ 2024-07-09  4:48   ` David Marchand
  7 siblings, 0 replies; 73+ messages in thread
From: David Marchand @ 2024-07-09  4:48 UTC (permalink / raw)
  To: Yoan Picchi; +Cc: dev, nd
On Mon, Jul 8, 2024 at 2:14 PM Yoan Picchi <yoan.picchi@arm.com> wrote:
>
> This patchset adds SVE support for the signature comparison in the cuckoo
> hash lookup and improves the existing NEON implementation. These
> optimizations required changes to the data format and signature of the
> relevant functions to support dense hitmasks (no padding) and having the
> primary and secondary hitmasks interleaved instead of being in their own
> array each.
>
> Benchmarking the cuckoo hash perf test, I observed this effect on speed:
>   There are no significant changes on Intel (ran on Sapphire Rapids)
>   Neon is up to 7-10% faster (ran on ampere altra)
>   128b SVE is about 3-5% slower than the optimized neon (ran on a graviton
>     3 cloud instance)
>   256b SVE is about 0-3% slower than the optimized neon (ran on a graviton
>     3 cloud instance)
>
> V2->V3:
>   Remove a redundant if in the test
>   Change a couple int to uint16_t in compare_signatures_dense
>   Several coding-style fixes
>
> V3->V4:
>   Rebase
>
> V4->V5:
>   Commit message
>
> V5->V6:
>   Move the arch-specific code into new arch-specific files
>   Isolate the data structure refactor from adding SVE
>
> V6->V7:
>   Commit message
>   Moved RTE_HASH_COMPARE_SVE to the last commit of the chain
>
> V7->V8:
>   Commit message
>   Typos and missing spaces
>
> V8->V9:
>   Use __rte_unused instead of (void)
>   Fix an indentation mistake
>
> V9->V10:
>   Fix more formatting and indentation
>   Move the new compare signature file directly in hash instead of being
>     in a new subdir
>   Re-order includes
>   Remove duplicated static check
>   Move rte_hash_sig_compare_function's definition into a private header
>
> V10->V11:
>   Split the "pack the hitmask" commit into four commits:
>     Move the compare function enum out of the ABI
>     Move the compare function implementations into arch-specific files
>     Add a missing check on RTE_HASH_BUCKET_ENTRIES in case we change it
>       in the future
>     Implement the dense hitmask
>   Add missing header guards
>   Move compare function enum into cuckoo_hash.c instead of its own header.
>
> V11->V12:
>   Change the name of the compare function file (remove the _pvt suffix)
>
> Yoan Picchi (7):
>   hash: make compare signature function enum private
>   hash: split compare signature into arch-specific files
>   hash: add a check on hash entry max size
>   hash: pack the hitmask for hash in bulk lookup
>   hash: optimize compare signature for NEON
>   test/hash: check bulk lookup of keys after collision
>   hash: add SVE support for bulk key lookup
>
>  .mailmap                              |   2 +
>  app/test/test_hash.c                  |  99 +++++++++---
>  lib/hash/compare_signatures_arm.h     | 121 +++++++++++++++
>  lib/hash/compare_signatures_generic.h |  40 +++++
>  lib/hash/compare_signatures_x86.h     |  55 +++++++
>  lib/hash/rte_cuckoo_hash.c            | 207 ++++++++++++++------------
>  lib/hash/rte_cuckoo_hash.h            |  10 +-
>  7 files changed, 410 insertions(+), 124 deletions(-)
>  create mode 100644 lib/hash/compare_signatures_arm.h
>  create mode 100644 lib/hash/compare_signatures_generic.h
>  create mode 100644 lib/hash/compare_signatures_x86.h
I added RN updates, reformatted commit logs, fixed header guards and
removed some pvt leftovers.
Series applied, thanks.
-- 
David Marchand
^ permalink raw reply	[flat|nested] 73+ messages in thread