DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation
@ 2021-09-06 16:03 Vladimir Medvedkin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 1/5] hash: add new toeplitz " Vladimir Medvedkin
                   ` (25 more replies)
  0 siblings, 26 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-09-06 16:03 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, andrey.chilikin, yipeng1.wang, sameh.gobriel,
	bruce.richardson

This patch series adds a new optimized implementation of the Toeplitz hash
function using Galois Field New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
data buffer, so there is no bulk implementation.
For performance reasons, the implementation is placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by checking the rte_thash_gfni_supported variable at run time) before calling
these functions.

Vladimir Medvedkin (5):
  hash: add new toeplitz hash implementation
  hash: enable gfni thash implementation
  doc/hash: update documentation for the thash library
  test/thash: add tests for a new Toeplitz hash function
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 231 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 125 +++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  37 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   1 +
 lib/hash/rte_thash.c                        |  69 ++++++++-
 lib/hash/rte_thash.h                        |  41 +++++
 lib/hash/rte_thash_gfni.h                   | 229 +++++++++++++++++++++++++++
 lib/hash/version.map                        |   3 +
 11 files changed, 735 insertions(+), 8 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH 1/5] hash: add new toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-09-06 16:03 ` Vladimir Medvedkin
  2021-10-07 18:23   ` Ananyev, Konstantin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
                   ` (24 subsequent siblings)
  25 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-09-06 16:03 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, andrey.chilikin, yipeng1.wang, sameh.gobriel,
	bruce.richardson, john.mcnamara

This patch adds a new Toeplitz hash implementation using
Galois Field New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/api/doxy-api-index.md |   1 +
 lib/hash/meson.build      |   1 +
 lib/hash/rte_thash.c      |  26 ++++++
 lib/hash/rte_thash.h      |  22 +++++
 lib/hash/rte_thash_gfni.h | 229 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/hash/version.map      |   2 +
 6 files changed, 281 insertions(+)
 create mode 100644 lib/hash/rte_thash_gfni.h

diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 1992107..7549477 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -139,6 +139,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..40444ac 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,6 +7,7 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
 )
 indirect_headers += files('rte_crc_arm64.h')
 
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index d5a95a6..07447f7 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -11,6 +11,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_thash_gfni.h>
 
 #define THASH_NAME_LEN		64
 #define TOEPLITZ_HASH_LEN	32
@@ -88,6 +89,23 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+uint8_t rte_thash_gfni_supported;
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, uint8_t *rss_key, int size)
+{
+	uint8_t *m = (uint8_t *)matrixes;
+	uint16_t win;
+	int i, j;
+
+	for (i = 0; i < size; i++) {
+		/* bytes i and i+1 of the key; wrap to 0, rss_key[size] is OOB */
+		win = (uint16_t)((rss_key[i] << 8) | rss_key[(i + 1) % size]);
+		for (j = 0; j < 8; j++)
+			m[i * 8 + j] = (uint8_t)(win >> (8 - j));
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
@@ -759,3 +777,11 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 
 	return ret;
 }
+
+RTE_INIT(rte_thash_gfni_init)
+{
+#ifdef __GFNI__
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI))
+		rte_thash_gfni_supported = 1;
+#endif
+}
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 76109fc..e3f1fc6 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -28,6 +28,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -113,6 +114,8 @@ union rte_thash_tuple {
 };
 #endif
 
+extern uint8_t rte_thash_gfni_supported;
+
 /**
  * Prepare special converted key to use with rte_softrss_be()
  * @param orig
@@ -223,6 +226,25 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Converts a Toeplitz hash key (RSS key) into the matrices required
+ * by the GFNI implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  Pointer to the memory where the matrices will be written.
+ *  Note: this memory must be at least size * 8 bytes long.
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, uint8_t *rss_key, int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..8f89d7d
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash function implementations
+ * using Galois Field New Instructions (GFNI).
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(uint64_t *mtrx, uint8_t *tuple, uint8_t *secondary_tuple,
+	int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+
+	for (; len > 0; len -= 64, tuple += 64) {
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(uint64_t *m, uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
+
+	return val;
+}
+
+/**
+ * Calculate Toeplitz hash for two independent data buffers.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple_1
+ *  Pointer to the first data to be hashed. Data must be in network byte order.
+ * @param tuple_2
+ *  Pointer to the second data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param val_1
+ *  Pointer to uint32_t where to put calculated Toeplitz hash value for
+ *  the first tuple.
+ * @param val_2
+ *  Pointer to uint32_t where to put calculated Toeplitz hash value for
+ *  the second tuple.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_x2(uint64_t *mtrx, uint8_t *tuple_1, uint8_t *tuple_2, int len,
+	uint32_t *val_1, uint32_t *val_2)
+{
+	__m512i xor_acc = __rte_thash_gfni(mtrx, tuple_1, tuple_2, len);
+	__rte_thash_xor_reduce(xor_acc, val_1, val_2);
+}
+
+#else /* __GFNI__ */
+
+static inline uint32_t
+rte_thash_gfni(uint64_t *mtrx __rte_unused, uint8_t *key __rte_unused,
+	int len __rte_unused)
+{
+	return 0;
+}
+
+static inline void
+rte_thash_gfni_x2(uint64_t *mtrx __rte_unused, uint8_t *tuple_1 __rte_unused,
+	uint8_t *tuple_2 __rte_unused, int len __rte_unused,
+	uint32_t *val_1 __rte_unused, uint32_t *val_2 __rte_unused)
+{
+
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index ce4309a..cecf922 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -39,10 +39,12 @@ EXPERIMENTAL {
 	rte_hash_rcu_qsbr_add;
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
+	rte_thash_complete_matrix;
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
 	rte_thash_get_helper;
 	rte_thash_get_key;
+	rte_thash_gfni_supported;
 	rte_thash_init_ctx;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 1/5] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-09-06 16:03 ` Vladimir Medvedkin
  2021-10-08 11:31   ` Ananyev, Konstantin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
                   ` (23 subsequent siblings)
  25 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-09-06 16:03 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, andrey.chilikin, yipeng1.wang, sameh.gobriel,
	bruce.richardson

This patch enables new GFNI Toeplitz hash in
predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 lib/hash/rte_thash.c | 43 +++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 07447f7..86a0e96 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -86,6 +86,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< rte_thash_complete_matrix generated matrices */
 	uint8_t		hash_key[0];
 };
 
@@ -253,12 +255,25 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL)
+			goto free_ctx;
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -372,6 +387,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (rte_thash_gfni_supported)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -628,6 +647,16 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	if (rte_thash_gfni_supported)
+		return ctx->matrices;
+
+	rte_errno = ENOTSUP;
+	return NULL;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -739,11 +768,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (rte_thash_gfni_supported)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index e3f1fc6..6e6861c 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -410,6 +410,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the Toeplitz hash matrices contained in the context.
+ * These matrices can be used with the fast Toeplitz hash implementation
+ * if the CPU supports GFNI.
+ * The matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the toeplitz hash key matrices on success
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index cecf922..3eda695 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -43,6 +43,7 @@ EXPERIMENTAL {
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
+	rte_thash_get_gfni_matrices;
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_gfni_supported;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH 3/5] doc/hash: update documentation for the thash library
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 1/5] hash: add new toeplitz " Vladimir Medvedkin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-09-06 16:03 ` Vladimir Medvedkin
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
                   ` (22 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-09-06 16:03 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, andrey.chilikin, yipeng1.wang, sameh.gobriel,
	bruce.richardson, john.mcnamara

This patch adds documentation for the new optimized Toeplitz hash
implementation using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
 doc/guides/rel_notes/release_21_11.rst      |  4 ++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..6f50a18 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
+* ``rte_thash_gfni_x2()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above-mentioned softrss functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last two functions are vectorized implementations using
+Galois Field New Instructions (GFNI). They can be used only if ``rte_thash_gfni_supported`` is true.
+They expect the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_x2()`` calculates the hash values for two independent tuples in one go.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_gfni_x2()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* Two tuple pointers.
+* A length of the longest tuple in bytes.
+* Two pointers to ``uint32_t`` variables to write the results to.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index d707a55..df28642 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -55,6 +55,10 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added an optimized Toeplitz hash implementation using Galois Field New Instructions (GFNI).
+
 
 Removed Items
 -------------
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (2 preceding siblings ...)
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
@ 2021-09-06 16:03 ` Vladimir Medvedkin
  2021-09-07  0:35   ` Stephen Hemminger
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
                   ` (21 subsequent siblings)
  25 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-09-06 16:03 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, andrey.chilikin, yipeng1.wang, sameh.gobriel,
	bruce.richardson

This patch provides a set of tests for verifying the new
implementation of Toeplitz hash function using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/test_thash.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..5327a02 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+uint8_t big_rss_key[] = {
+0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,204 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple;
+	uint32_t rss_l3, rss_l3l4;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(v4_tbl); i++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if ((rss_l3 != v4_tbl[i].hash_l3) ||
+				(rss_l3l4 != v4_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_DIM(v6_tbl); i++) {
+		for (j = 0; j < RTE_DIM(tuple.v6.src_addr); j++)
+			tuple.v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple.v6.dst_addr); j++)
+			tuple.v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if ((rss_l3 != v6_tbl[i].hash_l3) ||
+				(rss_l3l4 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_X2_DATA_BUF_1_HASH_IDX,
+	GFNI_X2_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < ITER; i++) {
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]);
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+		rte_thash_gfni_x2(rss_key_matrixes,
+			(uint8_t *)data[0], (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t),
+			&hash[GFNI_X2_DATA_BUF_1_HASH_IDX],
+			&hash[GFNI_X2_DATA_BUF_2_HASH_IDX]);
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_X2_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_X2_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static int
+test_toeplitz_hash_gfni_x2(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	uint8_t *tuples[2];
+	uint32_t rss_v4 = 0;
+	uint32_t rss_v6 = 0;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/* allocate memory enough for a biggest tuple */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/*Load IPv4 headers and copy it into the corresponding tuple*/
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/*Load IPv6 headers and copy it into the corresponding tuple*/
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		rte_thash_gfni_x2(rss_key_matrixes, tuples[0], tuples[1],
+			RTE_THASH_V6_L4_LEN * 4, &rss_v4, &rss_v6);
+
+		if ((rss_v4 != v4_tbl[i].hash_l3l4) ||
+				(rss_v6 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t arr[16];
+	uint32_t arr_softrss[16];
+	uint32_t hash_1, hash_2;
+	uint64_t rss_key_matrixes[RTE_DIM(big_rss_key)];
+	unsigned int i, size = RTE_DIM(arr) * sizeof(uint32_t);
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (i = 0; i < RTE_DIM(arr); i++) {
+		arr[i] = rte_rand();
+		arr_softrss[i] = rte_be_to_cpu_32(arr[i]);
+	}
+
+	hash_1 = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key);
+	hash_2 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)arr, size);
+
+	if (hash_1 != hash_2)
+		return -TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +804,10 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_x2),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (3 preceding siblings ...)
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
@ 2021-09-06 16:03 ` Vladimir Medvedkin
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (20 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-09-06 16:03 UTC (permalink / raw)
  To: dev
  Cc: konstantin.ananyev, andrey.chilikin, yipeng1.wang, sameh.gobriel,
	bruce.richardson

This patch adds performance tests for different implementations
of the Toeplitz hash function.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 125 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index a761168..be5df32 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -140,6 +140,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -314,6 +315,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+	'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..ccc4710
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define	BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+/*
+ * Measure the average cost, in TSC cycles per tuple, of every available
+ * Toeplitz hash implementation over BATCH_SZ random tuples of tuple_len bytes
+ * (rounded up to a multiple of 4). The GFNI variants are measured only when
+ * rte_thash_gfni_supported is set.
+ *
+ * Returns 0 on success, -1 if a tuple buffer could not be allocated.
+ */
+static int
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ] = { NULL };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	/* volatile, presumably so the hash calls are not optimized away */
+	volatile uint32_t hash = 0;
+	uint32_t hash_1 = 0;
+	uint32_t hash_2 = 0;
+	int ret = -1;
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		if (tuples[i] == NULL) {
+			printf("Cannot allocate tuple buffer %u\n", i);
+			goto free_tuples;
+		}
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	/* The software measurements are complete; the rest is GFNI-only */
+	ret = 0;
+	if (!rte_thash_gfni_supported)
+		goto free_tuples;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		/* Two tuples per call, so the batch is walked in pairs */
+		for (j = 0; j < BATCH_SZ; j += 2) {
+			rte_thash_gfni_x2(rss_key_matrixes,
+				(uint8_t *)tuples[j], (uint8_t *)tuples[j + 1],
+				len, &hash_1, &hash_2);
+
+			hash ^= hash_1 ^ hash_2;
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+free_tuples:
+	/* rte_free(NULL) is a no-op, so never-allocated slots are harmless */
+	for (i = 0; i < BATCH_SZ; i++)
+		rte_free(tuples[i]);
+
+	return ret;
+}
+
+/* Run the benchmark for each tuple size; fail if any run cannot set up. */
+static int
+test_thash_perf(void)
+{
+	static const unsigned int tuple_lens[] = {
+		IPV4_2_TUPLE_LEN, IPV4_4_TUPLE_LEN,
+		IPV6_2_TUPLE_LEN, IPV6_4_TUPLE_LEN,
+	};
+	unsigned int i;
+
+	for (i = 0; i < RTE_DIM(tuple_lens); i++) {
+		if (run_thash_test(tuple_lens[i]) != 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
@ 2021-09-07  0:35   ` Stephen Hemminger
  2021-09-08 13:59     ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Stephen Hemminger @ 2021-09-07  0:35 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, konstantin.ananyev, andrey.chilikin, yipeng1.wang,
	sameh.gobriel, bruce.richardson

On Mon,  6 Sep 2021 17:03:58 +0100
Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:

> +uint8_t big_rss_key[] = {
> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +};

Table should be static const and indented.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-09-07  0:35   ` Stephen Hemminger
@ 2021-09-08 13:59     ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-09-08 13:59 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: dev, konstantin.ananyev, andrey.chilikin, yipeng1.wang,
	sameh.gobriel, bruce.richardson

Hi Stephen,


On 07/09/2021 02:35, Stephen Hemminger wrote:
> On Mon,  6 Sep 2021 17:03:58 +0100
> Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
> 
>> +uint8_t big_rss_key[] = {
>> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +};
> 
> Table should be static const and indented.
> 

Thanks for the review, I'll fix it in v2.

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] hash: add new toeplitz hash implementation
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 1/5] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-10-07 18:23   ` Ananyev, Konstantin
  2021-10-08 11:19     ` Ananyev, Konstantin
  2021-10-15  9:11     ` Medvedkin, Vladimir
  0 siblings, 2 replies; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-07 18:23 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson,
	Bruce, Mcnamara, John


> This patch adds a new Toeplitz hash implementation using
> Galois Fields New Instructions (GFNI).
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  doc/api/doxy-api-index.md |   1 +
>  lib/hash/meson.build      |   1 +
>  lib/hash/rte_thash.c      |  26 ++++++
>  lib/hash/rte_thash.h      |  22 +++++
>  lib/hash/rte_thash_gfni.h | 229 ++++++++++++++++++++++++++++++++++++++++++++++
>  lib/hash/version.map      |   2 +
>  6 files changed, 281 insertions(+)
>  create mode 100644 lib/hash/rte_thash_gfni.h
> 
> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> index 1992107..7549477 100644
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -139,6 +139,7 @@ The public API headers are grouped by topics:
>    [hash]               (@ref rte_hash.h),
>    [jhash]              (@ref rte_jhash.h),
>    [thash]              (@ref rte_thash.h),
> +  [thash_gfni]         (@ref rte_thash_gfni.h),
>    [FBK hash]           (@ref rte_fbk_hash.h),
>    [CRC hash]           (@ref rte_hash_crc.h)
> 
> diff --git a/lib/hash/meson.build b/lib/hash/meson.build
> index 9bc5ef9..40444ac 100644
> --- a/lib/hash/meson.build
> +++ b/lib/hash/meson.build
> @@ -7,6 +7,7 @@ headers = files(
>          'rte_hash.h',
>          'rte_jhash.h',
>          'rte_thash.h',
> +        'rte_thash_gfni.h',
>  )
>  indirect_headers += files('rte_crc_arm64.h')
> 
> diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
> index d5a95a6..07447f7 100644
> --- a/lib/hash/rte_thash.c
> +++ b/lib/hash/rte_thash.c
> @@ -11,6 +11,7 @@
>  #include <rte_eal_memconfig.h>
>  #include <rte_log.h>
>  #include <rte_malloc.h>
> +#include <rte_thash_gfni.h>
> 
>  #define THASH_NAME_LEN		64
>  #define TOEPLITZ_HASH_LEN	32
> @@ -88,6 +89,23 @@ struct rte_thash_ctx {
>  	uint8_t		hash_key[0];
>  };
> 
> +uint8_t rte_thash_gfni_supported;

.. = 0;
?

> +
> +void
> +rte_thash_complete_matrix(uint64_t *matrixes, uint8_t *rss_key, int size)
> +{
> +	int i, j;
> +	uint8_t *m = (uint8_t *)matrixes;
> +
> +	for (i = 0; i < size; i++) {
> +		for (j = 0; j < 8; j++) {
> +			m[i * 8 + j] = (rss_key[i] << j)|
> +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
> +				(8 - j));
> +		}
> +	}
> +}
> +
>  static inline uint32_t
>  get_bit_lfsr(struct thash_lfsr *lfsr)
>  {
> @@ -759,3 +777,11 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
> 
>  	return ret;
>  }
> +
> +RTE_INIT(rte_thash_gfni_init)
> +{
> +#ifdef __GFNI__
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI))
> +		rte_thash_gfni_supported = 1;
> +#endif
> +}
> diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
> index 76109fc..e3f1fc6 100644
> --- a/lib/hash/rte_thash.h
> +++ b/lib/hash/rte_thash.h
> @@ -28,6 +28,7 @@ extern "C" {
>  #include <rte_config.h>
>  #include <rte_ip.h>
>  #include <rte_common.h>
> +#include <rte_thash_gfni.h>
> 
>  #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
>  #include <rte_vect.h>
> @@ -113,6 +114,8 @@ union rte_thash_tuple {
>  };
>  #endif
> 
> +extern uint8_t rte_thash_gfni_supported;
> +
>  /**
>   * Prepare special converted key to use with rte_softrss_be()
>   * @param orig
> @@ -223,6 +226,25 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
>  	return ret;
>  }
> 
> +/**
> + * Converts Toeplitz hash key (RSS key) into matrixes required
> + * for GFNI implementation
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param matrixes
> + *  pointer to the memory where matrixes will be written.
> + *  Note: the size of this memory must be equal to size * 8
> + * @param rss_key
> + *  pointer to the Toeplitz hash key
> + * @param size
> + *  Size of the rss_key in bytes.
> + */
> +__rte_experimental
> +void
> +rte_thash_complete_matrix(uint64_t *matrixes, uint8_t *rss_key, int size);
> +
>  /** @internal Logarithm of minimum size of the RSS ReTa */
>  #define	RTE_THASH_RETA_SZ_MIN	2U
>  /** @internal Logarithm of maximum size of the RSS ReTa */
> diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
> new file mode 100644
> index 0000000..8f89d7d
> --- /dev/null
> +++ b/lib/hash/rte_thash_gfni.h
> @@ -0,0 +1,229 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Intel Corporation
> + */
> +
> +#ifndef _RTE_THASH_GFNI_H_
> +#define _RTE_THASH_GFNI_H_
> +
> +/**
> + * @file
> + *
> + * Optimized Toeplitz hash functions implementation
> + * using Galois Fields New Instructions.
> + */
> +
> +#include <rte_vect.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#ifdef __GFNI__
> +
> +#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
> +#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
> +#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
> +#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
> +#define RTE_THASH_REWIND_MSK		0x0000000000113377
> +
> +__rte_internal
> +static inline void
> +__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
> +{
> +	__m256i tmp_256_1, tmp_256_2;
> +	__m128i tmp128_1, tmp128_2;
> +	uint64_t tmp_1, tmp_2;
> +
> +	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
> +	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
> +	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
> +
> +	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
> +	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
> +	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
> +
> +	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
> +	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
> +	tmp_1 ^= tmp_2;
> +
> +	*val_1 = (uint32_t)tmp_1;
> +	*val_2 = (uint32_t)(tmp_1 >> 32);
> +}
> +
> +__rte_internal
> +static inline __m512i
> +__rte_thash_gfni(uint64_t *mtrx, uint8_t *tuple, uint8_t *secondary_tuple,
> +	int len)

Here and in other fast-path functions:
const uint64_t  *mtrx

> +{
> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
> +						6, 5, 4, 3, 6, 5, 4, 3,
> +						5, 4, 3, 2, 5, 4, 3, 2,
> +						4, 3, 2, 1, 4, 3, 2, 1,
> +						3, 2, 1, 0, 3, 2, 1, 0,
> +						2, 1, 0, -1, 2, 1, 0, -1,
> +						1, 0, -1, -2, 1, 0, -1, -2,
> +						0, -1, -2, -3, 0, -1, -2, -3);
> +
> +	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 59, 0, 0, 0, 59,
> +						0, 0, 59, 58, 0, 0, 59, 58,
> +						0, 59, 58, 57, 0, 59, 58, 57);
> +	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
> +	const __m512i shift_8 = _mm512_set1_epi8(8);
> +	__m512i xor_acc = _mm512_setzero_si512();
> +	__m512i perm_bytes = _mm512_setzero_si512();
> +	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
> +	__mmask64 load_mask, permute_mask, permute_mask_2;
> +	int chunk_len = 0, i = 0;
> +	uint8_t mtrx_msk;
> +	const int prepend = 3;
> +
> +	for (; len > 0; len -= 64, tuple += 64) {

What will happen if len < 64?

> +		if (i == 8)
> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
> +				rewind_idx, perm_bytes);
> +
> +		permute_mask = RTE_THASH_FIRST_ITER_MSK;
> +		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
> +		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
> +		if (secondary_tuple) {
> +			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
> +			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
> +				secondary_tuple);
> +		}
> +
> +		chunk_len = __builtin_popcountll(load_mask);
> +		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
> +				permute_mask, permute_idx, tuple_bytes);
> +
> +			if (secondary_tuple)
> +				perm_bytes =
> +					_mm512_mask_permutexvar_epi8(perm_bytes,
> +					permute_mask_2, permute_idx,
> +					tuple_bytes_2);
> +
> +			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
> +			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
> +				matrixes, 0);
> +
> +			xor_acc = _mm512_xor_si512(xor_acc, vals);
> +			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
> +			permute_mask = RTE_THASH_PERM_MSK;
> +			if (secondary_tuple)
> +				permute_mask_2 = RTE_THASH_PERM_MSK_2;
> +		}
> +	}
> +
> +	int rest_len = (chunk_len + prepend) % 8;
> +	if (rest_len != 0) {
> +		mtrx_msk = (1 << (rest_len % 8)) - 1;
> +		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
> +		if (i == 8) {
> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
> +				rewind_idx, perm_bytes);
> +		} else {
> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
> +				permute_mask, permute_idx, tuple_bytes);
> +
> +			if (secondary_tuple)
> +				perm_bytes =
> +					_mm512_mask_permutexvar_epi8(
> +					perm_bytes, permute_mask_2,
> +					permute_idx, tuple_bytes_2);
> +		}
> +
> +		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
> +		xor_acc = _mm512_xor_si512(xor_acc, vals);
> +	}
> +
> +	return xor_acc;
> +}
> +
> +/**
> + * Calculate Toeplitz hash.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param m
> + *  Pointer to the matrices generated from the corresponding
> + *  RSS hash key using rte_thash_complete_matrix().
> + * @param tuple
> + *  Pointer to the data to be hashed. Data must be in network byte order.
> + * @param len
> + *  Length of the data to be hashed.
> + * @return
> + *  Calculated Toeplitz hash value.
> + */
> +__rte_experimental
> +static inline uint32_t
> +rte_thash_gfni(uint64_t *m, uint8_t *tuple, int len)
> +{
> +	uint32_t val, val_zero;
> +
> +	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
> +	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
> +
> +	return val;
> +}
> +
> +/**
> + * Calculate Toeplitz hash for two independent data buffers.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param m
> + *  Pointer to the matrices generated from the corresponding
> + *  RSS hash key using rte_thash_complete_matrix().
> + * @param tuple_1
> + *  Pointer to the data to be hashed. Data must be in network byte order.
> + * @param tuple_2
> + *  Pointer to the data to be hashed. Data must be in network byte order.
> + * @param len
> + *  Length of the largest data buffer to be hashed.
> + * @param val_1
> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
> + *  the first tuple.
> + * @param val_2
> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
> + *  the second tuple.
> + */
> +__rte_experimental
> +static inline void
> +rte_thash_gfni_x2(uint64_t *mtrx, uint8_t *tuple_1, uint8_t *tuple_2, int len,
> +	uint32_t *val_1, uint32_t *val_2)

Why just two?
Why not uint8_t *tuple[]
?

> +{
> +	__m512i xor_acc = __rte_thash_gfni(mtrx, tuple_1, tuple_2, len);
> +	__rte_thash_xor_reduce(xor_acc, val_1, val_2);
> +}
> +
> +#else /* __GFNI__ */
> +
> +static inline uint32_t
> +rte_thash_gfni(uint64_t *mtrx __rte_unused, uint8_t *key __rte_unused,
> +	int len __rte_unused)
> +{
> +	return 0;
> +}
> +
> +static inline void
> +rte_thash_gfni_x2(uint64_t *mtrx __rte_unused, uint8_t *tuple_1 __rte_unused,
> +	uint8_t *tuple_2 __rte_unused, int len __rte_unused,
> +	uint32_t *val_1 __rte_unused, uint32_t *val_2 __rte_unused)
> +{
> +

That seems inconsistent with dummy rte_thash_gfni() above.
Should be:
*val_1  = 0; *val_2 = 0; 
I think.

> +}
> +
> +#endif
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_THASH_GFNI_H_ */
> diff --git a/lib/hash/version.map b/lib/hash/version.map
> index ce4309a..cecf922 100644
> --- a/lib/hash/version.map
> +++ b/lib/hash/version.map
> @@ -39,10 +39,12 @@ EXPERIMENTAL {
>  	rte_hash_rcu_qsbr_add;
>  	rte_thash_add_helper;
>  	rte_thash_adjust_tuple;
> +	rte_thash_complete_matrix;
>  	rte_thash_find_existing;
>  	rte_thash_free_ctx;
>  	rte_thash_get_complement;
>  	rte_thash_get_helper;
>  	rte_thash_get_key;
> +	rte_thash_gfni_supported;
>  	rte_thash_init_ctx;
>  };
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] hash: add new toeplitz hash implementation
  2021-10-07 18:23   ` Ananyev, Konstantin
@ 2021-10-08 11:19     ` Ananyev, Konstantin
  2021-10-15  9:11     ` Medvedkin, Vladimir
  1 sibling, 0 replies; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-08 11:19 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson,
	Bruce, Mcnamara, John


> >
> > +uint8_t rte_thash_gfni_supported;
> 
> .. = 0;
> ?
> 

Also some comment with explanation would really be good here.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-08 11:31   ` Ananyev, Konstantin
  2021-10-15  9:13     ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-08 11:31 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce


> This patch enables new GFNI Toeplitz hash in
> predictable RSS library.
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  lib/hash/rte_thash.c | 43 +++++++++++++++++++++++++++++++++++++++----
>  lib/hash/rte_thash.h | 19 +++++++++++++++++++
>  lib/hash/version.map |  1 +
>  3 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
> index 07447f7..86a0e96 100644
> --- a/lib/hash/rte_thash.c
> +++ b/lib/hash/rte_thash.c
> @@ -86,6 +86,8 @@ struct rte_thash_ctx {
>  	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
>  	uint32_t	subtuples_nb;	/** < number of subtuples */
>  	uint32_t	flags;
> +	uint64_t	*matrices;

Comment, what is that, etc.

> +	/**< rte_thash_complete_matrix generated matrices */
>  	uint8_t		hash_key[0];
>  };
> 
> @@ -253,12 +255,25 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
>  			ctx->hash_key[i] = rte_rand();
>  	}
> 
> +	if (rte_thash_gfni_supported) {

I think it should be:
if (rte_thash_gfni_supported && rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)


> +		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
> +			RTE_CACHE_LINE_SIZE);

You can do it probably before allocation ctx, at the same place where te is allocated.
Might be a bit nicer.

> +		if (ctx->matrices == NULL)

		RTE_LOG(ERR, ...);
		rte_errno = ENOMEM;

> +			goto free_ctx;
> +
> +		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
> +			key_len);
> +	}
> +
>  	te->data = (void *)ctx;
>  	TAILQ_INSERT_TAIL(thash_list, te, next);
> 
>  	rte_mcfg_tailq_write_unlock();
> 
>  	return ctx;
> +
> +free_ctx:
> +	rte_free(ctx);
>  free_te:
>  	rte_free(te);
>  exit:
> @@ -372,6 +387,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
>  			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
>  	}
> 
> +	if (rte_thash_gfni_supported)

Here and in data-path functions, I think it would be better:
if (ctx->matrices != NULL)
> +		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
> +			ctx->key_len);
> +
>  	return 0;
>  }
> 
> @@ -628,6 +647,16 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
>  	return ctx->hash_key;
>  }
> 
> +const uint64_t *
> +rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
> +{
> +	if (rte_thash_gfni_supported)
> +		return ctx->matrices;

Why not just always:
return ctx->matrices;
?

> +
> +	rte_errno = ENOTSUP;
> +	return NULL;
> +}
> +
>  static inline uint8_t
>  read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
>  {
> @@ -739,11 +768,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
>  	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
> 
>  	for (i = 0; i < attempts; i++) {
> -		for (j = 0; j < (tuple_len / 4); j++)
> -			tmp_tuple[j] =
> -				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
> +		if (rte_thash_gfni_supported)
if (ctx->matrices)

> +			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
> +		else {
> +			for (j = 0; j < (tuple_len / 4); j++)
> +				tmp_tuple[j] =
> +					rte_be_to_cpu_32(
> +						*(uint32_t *)&tuple[j * 4]);
> +
> +			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
> +		}
> 
> -		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
>  		adj_bits = rte_thash_get_complement(h, hash, desired_value);
> 
>  		/*
> diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
> index e3f1fc6..6e6861c 100644
> --- a/lib/hash/rte_thash.h
> +++ b/lib/hash/rte_thash.h
> @@ -410,6 +410,25 @@ const uint8_t *
>  rte_thash_get_key(struct rte_thash_ctx *ctx);
> 
>  /**
> + * Get a pointer to the toeplitz hash matrices contained in the context.
> + * These matrices could be used with fast toeplitz hash implementation if
> + * CPU supports GFNI.
> + * Matrices change after each addition of a helper.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param ctx
> + *  Thash context
> + * @return
> + *  A pointer to the toeplitz hash key matrices on success
> + *  NULL if GFNI is not supported.
> + */
> +__rte_experimental
> +const uint64_t *
> +rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
> +
> +/**
>   * Function prototype for the rte_thash_adjust_tuple
>   * to check if adjusted tuple could be used.
>   * Generally it is some kind of lookup function to check
> diff --git a/lib/hash/version.map b/lib/hash/version.map
> index cecf922..3eda695 100644
> --- a/lib/hash/version.map
> +++ b/lib/hash/version.map
> @@ -43,6 +43,7 @@ EXPERIMENTAL {
>  	rte_thash_find_existing;
>  	rte_thash_free_ctx;
>  	rte_thash_get_complement;
> +	rte_thash_get_gfni_matrices;
>  	rte_thash_get_helper;
>  	rte_thash_get_key;
>  	rte_thash_gfni_supported;
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] hash: add new toeplitz hash implementation
  2021-10-07 18:23   ` Ananyev, Konstantin
  2021-10-08 11:19     ` Ananyev, Konstantin
@ 2021-10-15  9:11     ` Medvedkin, Vladimir
  2021-10-15 10:55       ` Ananyev, Konstantin
  1 sibling, 1 reply; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-15  9:11 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson,
	Bruce, Mcnamara, John

Hi Konstantin,

Thanks for the review,

On 07/10/2021 20:23, Ananyev, Konstantin wrote:
> 
>> This patch add a new Toeplitz hash implementation using
>> Galios Fields New Instructions (GFNI).
>>
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>> ---
>>   doc/api/doxy-api-index.md |   1 +
>>   lib/hash/meson.build      |   1 +
>>   lib/hash/rte_thash.c      |  26 ++++++
>>   lib/hash/rte_thash.h      |  22 +++++
>>   lib/hash/rte_thash_gfni.h | 229 ++++++++++++++++++++++++++++++++++++++++++++++
>>   lib/hash/version.map      |   2 +
>>   6 files changed, 281 insertions(+)
>>   create mode 100644 lib/hash/rte_thash_gfni.h
>>
>> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
>> index 1992107..7549477 100644
>> --- a/doc/api/doxy-api-index.md
>> +++ b/doc/api/doxy-api-index.md
>> @@ -139,6 +139,7 @@ The public API headers are grouped by topics:
>>     [hash]               (@ref rte_hash.h),
>>     [jhash]              (@ref rte_jhash.h),
>>     [thash]              (@ref rte_thash.h),
>> +  [thash_gfni]         (@ref rte_thash_gfni.h),
>>     [FBK hash]           (@ref rte_fbk_hash.h),
>>     [CRC hash]           (@ref rte_hash_crc.h)
>>
>> diff --git a/lib/hash/meson.build b/lib/hash/meson.build
>> index 9bc5ef9..40444ac 100644
>> --- a/lib/hash/meson.build
>> +++ b/lib/hash/meson.build
>> @@ -7,6 +7,7 @@ headers = files(
>>           'rte_hash.h',
>>           'rte_jhash.h',
>>           'rte_thash.h',
>> +        'rte_thash_gfni.h',
>>   )
>>   indirect_headers += files('rte_crc_arm64.h')
>>
>> diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
>> index d5a95a6..07447f7 100644
>> --- a/lib/hash/rte_thash.c
>> +++ b/lib/hash/rte_thash.c
>> @@ -11,6 +11,7 @@
>>   #include <rte_eal_memconfig.h>
>>   #include <rte_log.h>
>>   #include <rte_malloc.h>
>> +#include <rte_thash_gfni.h>
>>
>>   #define THASH_NAME_LEN		64
>>   #define TOEPLITZ_HASH_LEN	32
>> @@ -88,6 +89,23 @@ struct rte_thash_ctx {
>>   	uint8_t		hash_key[0];
>>   };
>>
>> +uint8_t rte_thash_gfni_supported;
> 
> .. = 0;
> ?
> 

This goes against style:
ERROR:GLOBAL_INITIALISERS: do not initialise globals to 0
I'll init it inside the RTE_INIT section

>> +
>> +void
>> +rte_thash_complete_matrix(uint64_t *matrixes, uint8_t *rss_key, int size)
>> +{
>> +	int i, j;
>> +	uint8_t *m = (uint8_t *)matrixes;
>> +
>> +	for (i = 0; i < size; i++) {
>> +		for (j = 0; j < 8; j++) {
>> +			m[i * 8 + j] = (rss_key[i] << j)|
>> +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
>> +				(8 - j));
>> +		}
>> +	}
>> +}
>> +
>>   static inline uint32_t
>>   get_bit_lfsr(struct thash_lfsr *lfsr)
>>   {
>> @@ -759,3 +777,11 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
>>
>>   	return ret;
>>   }
>> +
>> +RTE_INIT(rte_thash_gfni_init)
>> +{
>> +#ifdef __GFNI__
>> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI))
>> +		rte_thash_gfni_supported = 1;
>> +#endif
>> +}
>> diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
>> index 76109fc..e3f1fc6 100644
>> --- a/lib/hash/rte_thash.h
>> +++ b/lib/hash/rte_thash.h
>> @@ -28,6 +28,7 @@ extern "C" {
>>   #include <rte_config.h>
>>   #include <rte_ip.h>
>>   #include <rte_common.h>
>> +#include <rte_thash_gfni.h>
>>
>>   #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
>>   #include <rte_vect.h>
>> @@ -113,6 +114,8 @@ union rte_thash_tuple {
>>   };
>>   #endif
>>
>> +extern uint8_t rte_thash_gfni_supported;
>> +
>>   /**
>>    * Prepare special converted key to use with rte_softrss_be()
>>    * @param orig
>> @@ -223,6 +226,25 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
>>   	return ret;
>>   }
>>
>> +/**
>> + * Converts Toeplitz hash key (RSS key) into matrixes required
>> + * for GFNI implementation
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param matrixes
>> + *  pointer to the memory where matrixes will be writen.
>> + *  Note: the size of this memory must be equal to size * 8
>> + * @param rss_key
>> + *  pointer to the Toeplitz hash key
>> + * @param size
>> + *  Size of the rss_key in bytes.
>> + */
>> +__rte_experimental
>> +void
>> +rte_thash_complete_matrix(uint64_t *matrixes, uint8_t *rss_key, int size);
>> +
>>   /** @internal Logarithm of minimum size of the RSS ReTa */
>>   #define	RTE_THASH_RETA_SZ_MIN	2U
>>   /** @internal Logarithm of maximum size of the RSS ReTa */
>> diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
>> new file mode 100644
>> index 0000000..8f89d7d
>> --- /dev/null
>> +++ b/lib/hash/rte_thash_gfni.h
>> @@ -0,0 +1,229 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2021 Intel Corporation
>> + */
>> +
>> +#ifndef _RTE_THASH_GFNI_H_
>> +#define _RTE_THASH_GFNI_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * Optimized Toeplitz hash functions implementation
>> + * using Galois Fields New Instructions.
>> + */
>> +
>> +#include <rte_vect.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#ifdef __GFNI__
>> +
>> +#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
>> +#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
>> +#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
>> +#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
>> +#define RTE_THASH_REWIND_MSK		0x0000000000113377
>> +
>> +__rte_internal
>> +static inline void
>> +__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
>> +{
>> +	__m256i tmp_256_1, tmp_256_2;
>> +	__m128i tmp128_1, tmp128_2;
>> +	uint64_t tmp_1, tmp_2;
>> +
>> +	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
>> +	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
>> +	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
>> +
>> +	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
>> +	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
>> +	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
>> +
>> +	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
>> +	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
>> +	tmp_1 ^= tmp_2;
>> +
>> +	*val_1 = (uint32_t)tmp_1;
>> +	*val_2 = (uint32_t)(tmp_1 >> 32);
>> +}
>> +
>> +__rte_internal
>> +static inline __m512i
>> +__rte_thash_gfni(uint64_t *mtrx, uint8_t *tuple, uint8_t *secondary_tuple,
>> +	int len)
> 
> Here and in other fast-path functions:
> const uint64_t  *mtrx
> 

Agree

>> +{
>> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
>> +						6, 5, 4, 3, 6, 5, 4, 3,
>> +						5, 4, 3, 2, 5, 4, 3, 2,
>> +						4, 3, 2, 1, 4, 3, 2, 1,
>> +						3, 2, 1, 0, 3, 2, 1, 0,
>> +						2, 1, 0, -1, 2, 1, 0, -1,
>> +						1, 0, -1, -2, 1, 0, -1, -2,
>> +						0, -1, -2, -3, 0, -1, -2, -3);
>> +
>> +	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 59, 0, 0, 0, 59,
>> +						0, 0, 59, 58, 0, 0, 59, 58,
>> +						0, 59, 58, 57, 0, 59, 58, 57);
>> +	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
>> +	const __m512i shift_8 = _mm512_set1_epi8(8);
>> +	__m512i xor_acc = _mm512_setzero_si512();
>> +	__m512i perm_bytes = _mm512_setzero_si512();
>> +	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
>> +	__mmask64 load_mask, permute_mask, permute_mask_2;
>> +	int chunk_len = 0, i = 0;
>> +	uint8_t mtrx_msk;
>> +	const int prepend = 3;
>> +
>> +	for (; len > 0; len -= 64, tuple += 64) {
> 
> What will happen if len < 64?
> 

If len < 64, only the necessary number of bytes is loaded into the
ZMM register (see load_mask). After that, the inner loop with the actual
calculations is executed for all loaded bytes.

>> +		if (i == 8)
>> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
>> +				rewind_idx, perm_bytes);
>> +
>> +		permute_mask = RTE_THASH_FIRST_ITER_MSK;
>> +		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
>> +		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
>> +		if (secondary_tuple) {
>> +			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
>> +			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
>> +				secondary_tuple);
>> +		}
>> +
>> +		chunk_len = __builtin_popcountll(load_mask);
>> +		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
>> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
>> +				permute_mask, permute_idx, tuple_bytes);
>> +
>> +			if (secondary_tuple)
>> +				perm_bytes =
>> +					_mm512_mask_permutexvar_epi8(perm_bytes,
>> +					permute_mask_2, permute_idx,
>> +					tuple_bytes_2);
>> +
>> +			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
>> +			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
>> +				matrixes, 0);
>> +
>> +			xor_acc = _mm512_xor_si512(xor_acc, vals);
>> +			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
>> +			permute_mask = RTE_THASH_PERM_MSK;
>> +			if (secondary_tuple)
>> +				permute_mask_2 = RTE_THASH_PERM_MSK_2;
>> +		}
>> +	}
>> +
>> +	int rest_len = (chunk_len + prepend) % 8;
>> +	if (rest_len != 0) {
>> +		mtrx_msk = (1 << (rest_len % 8)) - 1;
>> +		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
>> +		if (i == 8) {
>> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
>> +				rewind_idx, perm_bytes);
>> +		} else {
>> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
>> +				permute_mask, permute_idx, tuple_bytes);
>> +
>> +			if (secondary_tuple)
>> +				perm_bytes =
>> +					_mm512_mask_permutexvar_epi8(
>> +					perm_bytes, permute_mask_2,
>> +					permute_idx, tuple_bytes_2);
>> +		}
>> +
>> +		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
>> +		xor_acc = _mm512_xor_si512(xor_acc, vals);
>> +	}
>> +
>> +	return xor_acc;
>> +}
>> +
>> +/**
>> + * Calculate Toeplitz hash.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param m
>> + *  Pointer to the matrices generated from the corresponding
>> + *  RSS hash key using rte_thash_complete_matrix().
>> + * @param tuple
>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>> + * @param len
>> + *  Length of the data to be hashed.
>> + * @return
>> + *  Calculated Toeplitz hash value.
>> + */
>> +__rte_experimental
>> +static inline uint32_t
>> +rte_thash_gfni(uint64_t *m, uint8_t *tuple, int len)
>> +{
>> +	uint32_t val, val_zero;
>> +
>> +	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
>> +	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
>> +
>> +	return val;
>> +}
>> +
>> +/**
>> + * Calculate Toeplitz hash for two independent data buffers.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param m
>> + *  Pointer to the matrices generated from the corresponding
>> + *  RSS hash key using rte_thash_complete_matrix().
>> + * @param tuple_1
>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>> + * @param tuple_2
>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>> + * @param len
>> + *  Length of the largest data buffer to be hashed.
>> + * @param val_1
>> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
>> + *  the first tuple.
>> + * @param val_2
>> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
>> + *  the second tuple.
>> + */
>> +__rte_experimental
>> +static inline void
>> +rte_thash_gfni_x2(uint64_t *mtrx, uint8_t *tuple_1, uint8_t *tuple_2, int len,
>> +	uint32_t *val_1, uint32_t *val_2)
> 
> Why just two?
> Why not uint8_t *tuple[]
> ?
> 

The x2 version was added because the ZMM register that holds the input key
(input tuple) bytes has unused space, which can hold the bytes of a second
input key, so it helps to improve performance in some cases.
A bulk version wasn't added because in the vast majority of cases the hash
will be calculated for a single input key.
Hiding this function inside a .c file would significantly hurt performance,
because it takes just a few cycles to calculate the hash for the most popular
key sizes.

>> +{
>> +	__m512i xor_acc = __rte_thash_gfni(mtrx, tuple_1, tuple_2, len);
>> +	__rte_thash_xor_reduce(xor_acc, val_1, val_2);
>> +}
>> +
>> +#else /* __GFNI__ */
>> +
>> +static inline uint32_t
>> +rte_thash_gfni(uint64_t *mtrx __rte_unused, uint8_t *key __rte_unused,
>> +	int len __rte_unused)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void
>> +rte_thash_gfni_x2(uint64_t *mtrx __rte_unused, uint8_t *tuple_1 __rte_unused,
>> +	uint8_t *tuple_2 __rte_unused, int len __rte_unused,
>> +	uint32_t *val_1 __rte_unused, uint32_t *val_2 __rte_unused)
>> +{
>> +
> 
> That seems inconsistent with dummy rte_thash_gfni() above.
> Should be:
> *val_1  = 0; *val_2 = 0;
> I think.
> 

Agree

>> +}
>> +
>> +#endif
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_THASH_GFNI_H_ */
>> diff --git a/lib/hash/version.map b/lib/hash/version.map
>> index ce4309a..cecf922 100644
>> --- a/lib/hash/version.map
>> +++ b/lib/hash/version.map
>> @@ -39,10 +39,12 @@ EXPERIMENTAL {
>>   	rte_hash_rcu_qsbr_add;
>>   	rte_thash_add_helper;
>>   	rte_thash_adjust_tuple;
>> +	rte_thash_complete_matrix;
>>   	rte_thash_find_existing;
>>   	rte_thash_free_ctx;
>>   	rte_thash_get_complement;
>>   	rte_thash_get_helper;
>>   	rte_thash_get_key;
>> +	rte_thash_gfni_supported;
>>   	rte_thash_init_ctx;
>>   };
>> --
>> 2.7.4
> 

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation
  2021-10-08 11:31   ` Ananyev, Konstantin
@ 2021-10-15  9:13     ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-15  9:13 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce

Hi Konstantin,

On 08/10/2021 13:31, Ananyev, Konstantin wrote:
> 
>> This patch enables new GFNI Toeplitz hash in
>> predictable RSS library.
>>
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>> ---
>>   lib/hash/rte_thash.c | 43 +++++++++++++++++++++++++++++++++++++++----
>>   lib/hash/rte_thash.h | 19 +++++++++++++++++++
>>   lib/hash/version.map |  1 +
>>   3 files changed, 59 insertions(+), 4 deletions(-)
>>
>> diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
>> index 07447f7..86a0e96 100644
>> --- a/lib/hash/rte_thash.c
>> +++ b/lib/hash/rte_thash.c
>> @@ -86,6 +86,8 @@ struct rte_thash_ctx {
>>   	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
>>   	uint32_t	subtuples_nb;	/** < number of subtuples */
>>   	uint32_t	flags;
>> +	uint64_t	*matrices;
> 
> Comment, what is that, etc.
> 

I'll rephrase the comment below.

>> +	/**< rte_thash_complete_matrix generated matrices */
>>   	uint8_t		hash_key[0];
>>   };
>>
>> @@ -253,12 +255,25 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
>>   			ctx->hash_key[i] = rte_rand();
>>   	}
>>
>> +	if (rte_thash_gfni_supported) {
> 
> I think it should be:
> if (rte_thash_gfni_supported && rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
> 
> 

Agree

>> +		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
>> +			RTE_CACHE_LINE_SIZE);
> 
> You can do it probably before allocation ctx, at the same place where te is allocated.
> Might be a bit nicer.
> 

I'd prefer to keep allocation and initialization of the matrices in one
place: just below there is a call to rte_thash_complete_matrix(), which uses
the previously generated ctx->hash_key.

>> +		if (ctx->matrices == NULL)
> 
> 		RTE_LOG(ERR, ...);
> 		rte_ernno = ENOMEM;
> 

Agree

>> +			goto free_ctx;
>> +
>> +		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
>> +			key_len);
>> +	}
>> +
>>   	te->data = (void *)ctx;
>>   	TAILQ_INSERT_TAIL(thash_list, te, next);
>>
>>   	rte_mcfg_tailq_write_unlock();
>>
>>   	return ctx;
>> +
>> +free_ctx:
>> +	rte_free(ctx);
>>   free_te:
>>   	rte_free(te);
>>   exit:
>> @@ -372,6 +387,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
>>   			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
>>   	}
>>
>> +	if (rte_thash_gfni_supported)
> 
> Here and in data-path functions, I think it would be better:
> if (ctx->matrices != NULL)

Agree

>> +		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
>> +			ctx->key_len);
>> +
>>   	return 0;
>>   }
>>
>> @@ -628,6 +647,16 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
>>   	return ctx->hash_key;
>>   }
>>
>> +const uint64_t *
>> +rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
>> +{
>> +	if (rte_thash_gfni_supported)
>> +		return ctx->matrices;
> 
> Why not just always:
> return ctx->matices;
> ?
> 

Agree

>> +
>> +	rte_errno = ENOTSUP;
>> +	return NULL;
>> +}
>> +
>>   static inline uint8_t
>>   read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
>>   {
>> @@ -739,11 +768,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
>>   	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
>>
>>   	for (i = 0; i < attempts; i++) {
>> -		for (j = 0; j < (tuple_len / 4); j++)
>> -			tmp_tuple[j] =
>> -				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
>> +		if (rte_thash_gfni_supported)
> if (ctx->matrices)
> 
>> +			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
>> +		else {
>> +			for (j = 0; j < (tuple_len / 4); j++)
>> +				tmp_tuple[j] =
>> +					rte_be_to_cpu_32(
>> +						*(uint32_t *)&tuple[j * 4]);
>> +
>> +			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
>> +		}
>>
>> -		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
>>   		adj_bits = rte_thash_get_complement(h, hash, desired_value);
>>
>>   		/*
>> diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
>> index e3f1fc6..6e6861c 100644
>> --- a/lib/hash/rte_thash.h
>> +++ b/lib/hash/rte_thash.h
>> @@ -410,6 +410,25 @@ const uint8_t *
>>   rte_thash_get_key(struct rte_thash_ctx *ctx);
>>
>>   /**
>> + * Get a pointer to the toeplitz hash matrices contained in the context.
>> + * These matrices could be used with fast toeplitz hash implementation if
>> + * CPU supports GFNI.
>> + * Matrices changes after each addition of a helper.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param ctx
>> + *  Thash context
>> + * @return
>> + *  A pointer to the toeplitz hash key matrices on success
>> + *  NULL if GFNI is not supported.
>> + */
>> +__rte_experimental
>> +const uint64_t *
>> +rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
>> +
>> +/**
>>    * Function prototype for the rte_thash_adjust_tuple
>>    * to check if adjusted tuple could be used.
>>    * Generally it is some kind of lookup function to check
>> diff --git a/lib/hash/version.map b/lib/hash/version.map
>> index cecf922..3eda695 100644
>> --- a/lib/hash/version.map
>> +++ b/lib/hash/version.map
>> @@ -43,6 +43,7 @@ EXPERIMENTAL {
>>   	rte_thash_find_existing;
>>   	rte_thash_free_ctx;
>>   	rte_thash_get_complement;
>> +	rte_thash_get_gfni_matrices;
>>   	rte_thash_get_helper;
>>   	rte_thash_get_key;
>>   	rte_thash_gfni_supported;
>> --
>> 2.7.4
> 

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (4 preceding siblings ...)
  2021-09-06 16:03 ` [dpdk-dev] [PATCH 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
@ 2021-10-15  9:30 ` Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
                     ` (5 more replies)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
                   ` (19 subsequent siblings)
  25 siblings, 6 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-15  9:30 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch series adds a new optimized implementation of the Toeplitz hash
function using Galois Field New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
piece of data, so there is no bulk implementation.
For performance reasons, the implementation was placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by doing a runtime check of the rte_thash_gfni_supported variable) before
calling these functions.

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (5):
  hash: add new toeplitz hash implementation
  hash: enable gfni thash implementation
  doc/hash: update documentation for the thash library
  test/thash: add tests for a new Toeplitz hash function
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 231 +++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 125 +++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  37 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   1 +
 lib/hash/rte_thash.c                        |  72 ++++++++-
 lib/hash/rte_thash.h                        |  43 ++++++
 lib/hash/rte_thash_gfni.h                   | 232 ++++++++++++++++++++++++++++
 lib/hash/version.map                        |   3 +
 11 files changed, 743 insertions(+), 8 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (5 preceding siblings ...)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-15  9:30 ` Vladimir Medvedkin
  2021-10-15 16:58   ` Stephen Hemminger
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
                   ` (18 subsequent siblings)
  25 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-15  9:30 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds a new Toeplitz hash implementation using
Galois Field New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/api/doxy-api-index.md |   1 +
 lib/hash/meson.build      |   1 +
 lib/hash/rte_thash.c      |  28 ++++++
 lib/hash/rte_thash.h      |  24 +++++
 lib/hash/rte_thash_gfni.h | 232 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/hash/version.map      |   2 +
 6 files changed, 288 insertions(+)
 create mode 100644 lib/hash/rte_thash_gfni.h

diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 1992107..7549477 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -139,6 +139,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..40444ac 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,6 +7,7 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
 )
 indirect_headers += files('rte_crc_arm64.h')
 
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..59a8b8e 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -13,6 +13,7 @@
 #include <rte_eal_memconfig.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
+#include <rte_thash_gfni.h>
 
 #define THASH_NAME_LEN		64
 #define TOEPLITZ_HASH_LEN	32
@@ -90,6 +91,24 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+/** Flag indicating GFNI support */
+uint8_t rte_thash_gfni_supported;
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	int i, j;
+	uint8_t *m = (uint8_t *)matrixes;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < 8; j++) {
+			m[i * 8 + j] = (rss_key[i] << j)|
+				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
+				(8 - j));
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
@@ -761,3 +780,12 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 
 	return ret;
 }
+
+RTE_INIT(rte_thash_gfni_init)
+{
+	rte_thash_gfni_supported = 0;
+#ifdef __GFNI__
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI))
+		rte_thash_gfni_supported = 1;
+#endif
+}
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 76109fc..e4f14a5 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -28,6 +28,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -113,6 +114,9 @@ union rte_thash_tuple {
 };
 #endif
 
+/** Flag indicating GFNI support */
+extern uint8_t rte_thash_gfni_supported;
+
 /**
  * Prepare special converted key to use with rte_softrss_be()
  * @param orig
@@ -223,6 +227,26 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Converts Toeplitz hash key (RSS key) into matrixes required
+ * for GFNI implementation
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..2e5de0d
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,232 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+
+	for (; len > 0; len -= 64, tuple += 64) {
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
+
+	return val;
+}
+
+/**
+ * Calculate Toeplitz hash for two independent data buffers.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple_1
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param tuple_2
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param val_1
+ *  Pointer to uint32_t where to put calculated Toeplitz hash value for
+ *  the first tuple.
+ * @param val_2
+ *  Pointer to uint32_t where to put calculated Toeplitz hash value for
+ *  the second tuple.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_x2(const uint64_t *mtrx, const uint8_t *tuple_1,
+	const uint8_t *tuple_2, int len,
+	uint32_t *val_1, uint32_t *val_2)
+{
+	__m512i xor_acc = __rte_thash_gfni(mtrx, tuple_1, tuple_2, len);
+	__rte_thash_xor_reduce(xor_acc, val_1, val_2);
+}
+
+#else /* __GFNI__ */
+
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	return 0;
+}
+
+static inline void
+rte_thash_gfni_x2(const uint64_t *mtrx __rte_unused,
+	const uint8_t *tuple_1 __rte_unused,
+	const uint8_t *tuple_2 __rte_unused, int len __rte_unused,
+	uint32_t *val_1 __rte_unused, uint32_t *val_2 __rte_unused)
+{
+	*val_1 = 0;
+	*val_2 = 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index ce4309a..cecf922 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -39,10 +39,12 @@ EXPERIMENTAL {
 	rte_hash_rcu_qsbr_add;
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
+	rte_thash_complete_matrix;
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
 	rte_thash_get_helper;
 	rte_thash_get_key;
+	rte_thash_gfni_supported;
 	rte_thash_init_ctx;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v2 2/5] hash: enable gfni thash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (6 preceding siblings ...)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-15  9:30 ` Vladimir Medvedkin
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
                   ` (17 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-15  9:30 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch enables the new GFNI Toeplitz hash in
the predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 lib/hash/rte_thash.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 59a8b8e..1f1d0cd 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -88,6 +88,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -256,12 +258,30 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512)) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -375,6 +395,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -631,6 +655,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+/*
+ * Return the matrices pointer stored in the context.
+ * ctx itself is dereferenced without a NULL check.
+ */
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	return ctx->matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -742,11 +772,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index e4f14a5..7afd359 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -412,6 +412,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the toeplitz hash matrices contained in the context.
+ * These matrices could be used with fast toeplitz hash implementation if
+ * CPU supports GFNI.
+ * The matrices change each time a helper is added.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the toeplitz hash key matrices on success
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index cecf922..3eda695 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -43,6 +43,7 @@ EXPERIMENTAL {
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
+	rte_thash_get_gfni_matrices;
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_gfni_supported;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v2 3/5] doc/hash: update documentation for the thash library
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (7 preceding siblings ...)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-15  9:30 ` Vladimir Medvedkin
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
                   ` (16 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-15  9:30 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds documentation for the new optimized Toeplitz hash
implementation using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
 doc/guides/rel_notes/release_21_11.rst      |  4 ++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..6f50a18 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
+* ``rte_thash_gfni_x2()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the following parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above-mentioned _softrss_ functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last two functions are vectorized implementations using
+Galois Fields New Instructions (GFNI). They can be used only if
+``rte_thash_gfni_supported`` is true.
+They expect the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_x2()`` calculates the hash values for two independent tuples in one go.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_gfni_x2()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* Two tuple pointers.
+* A length of the longest tuple in bytes.
+* Two pointers to ``uint32_t`` values where the results are written.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 4c56cdf..5b53117 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -159,6 +159,10 @@ New Features
   * Added tests to verify tunnel header verification in IPsec inbound.
   * Added tests to verify inner checksum.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v2 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (8 preceding siblings ...)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
@ 2021-10-15  9:30 ` Vladimir Medvedkin
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
                   ` (15 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-15  9:30 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch provides a set of tests for verifying the new
implementation of Toeplitz hash function using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/test_thash.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..dac9caa 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+/*
+ * 200-byte RSS key built by repeating one 40-byte pattern five times.
+ * Used by test_big_tuple_gfni().
+ */
+static const uint8_t big_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,204 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint64_t matrices[RTE_DIM(default_rss_key)];
+	union rte_thash_tuple tuple;
+	uint8_t *raw_tuple = (uint8_t *)&tuple;
+	uint32_t hash_l3, hash_l3l4;
+	uint32_t row, k;
+
+	/* Nothing to verify when the GFNI implementation is unavailable. */
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	/* Derive the GFNI matrices from the default RSS key. */
+	rte_thash_complete_matrix(matrices, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	/* IPv4 vectors: fields are stored big-endian before hashing. */
+	for (row = 0; row < RTE_DIM(v4_tbl); row++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[row].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[row].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[row].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[row].src_port);
+
+		hash_l3 = rte_thash_gfni(matrices, raw_tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		hash_l3l4 = rte_thash_gfni(matrices, raw_tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+
+		if (!(hash_l3 == v4_tbl[row].hash_l3 &&
+				hash_l3l4 == v4_tbl[row].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	/* IPv6 vectors: addresses are copied byte by byte. */
+	for (row = 0; row < RTE_DIM(v6_tbl); row++) {
+		for (k = 0; k < RTE_DIM(tuple.v6.src_addr); k++)
+			tuple.v6.src_addr[k] = v6_tbl[row].src_ip[k];
+		for (k = 0; k < RTE_DIM(tuple.v6.dst_addr); k++)
+			tuple.v6.dst_addr[k] = v6_tbl[row].dst_ip[k];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[row].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[row].src_port);
+
+		hash_l3 = rte_thash_gfni(matrices, raw_tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		hash_l3l4 = rte_thash_gfni(matrices, raw_tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+
+		if (!(hash_l3 == v6_tbl[row].hash_l3 &&
+				hash_l3l4 == v6_tbl[row].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+/* Number of 32-bit words in each randomly generated input buffer. */
+#define DATA_SZ		4
+/* Number of iterations of the random-data comparison loop. */
+#define ITER		1000
+
+/* Slots of the hash[] array in test_toeplitz_hash_rand_data(). */
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_X2_DATA_BUF_1_HASH_IDX,
+	GFNI_X2_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < ITER; i++) {
+		/*
+		 * data[] holds the raw bytes given to the GFNI functions,
+		 * which take input in network byte order; scalar_data[] is
+		 * the same input converted to host order for rte_softrss().
+		 */
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_be_to_cpu_32(data[0][j]);
+			scalar_data[1][j] = rte_be_to_cpu_32(data[1][j]);
+		}
+
+		/* Reference values from the scalar implementation. */
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+
+		/* Same buffers through the single-tuple GFNI function. */
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+
+		/* Both buffers at once through the two-tuple function. */
+		rte_thash_gfni_x2(rss_key_matrixes,
+			(uint8_t *)data[0], (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t),
+			&hash[GFNI_X2_DATA_BUF_1_HASH_IDX],
+			&hash[GFNI_X2_DATA_BUF_2_HASH_IDX]);
+
+		/* Every GFNI result must match the scalar reference. */
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_X2_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_X2_DATA_BUF_2_HASH_IDX]))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static int
+test_toeplitz_hash_gfni_x2(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	/* NULL-initialised so the cleanup path can free unconditionally. */
+	uint8_t *tuples[2] = { NULL, NULL };
+	uint32_t rss_v4 = 0;
+	uint32_t rss_v6 = 0;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int ret = -TEST_FAILED;
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/*
+		 * Allocate zeroed memory large enough for the biggest tuple.
+		 * NOTE(review): the IPv4 buffer is hashed with the IPv6
+		 * length below, so this presumably relies on the trailing
+		 * zero bytes not affecting the result - confirm.
+		 */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			goto cleanup;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/* Load IPv4 headers and copy them into the first buffer */
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/* Load IPv6 headers and copy them into the second buffer */
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		rte_thash_gfni_x2(rss_key_matrixes, tuples[0], tuples[1],
+			RTE_THASH_V6_L4_LEN * 4, &rss_v4, &rss_v6);
+
+		if ((rss_v4 != v4_tbl[i].hash_l3l4) ||
+				(rss_v6 != v6_tbl[i].hash_l3l4))
+			goto cleanup;
+	}
+
+	ret = TEST_SUCCESS;
+
+cleanup:
+	/* Release the buffers on every exit path. */
+	for (i = 0; i < RTE_DIM(tuples); i++)
+		rte_free(tuples[i]);
+
+	return ret;
+}
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint64_t matrices[RTE_DIM(big_rss_key)];
+	uint32_t arr[16];
+	uint32_t arr_softrss[16];
+	uint32_t scalar_hash, gfni_hash;
+	unsigned int idx;
+	unsigned int nbytes = RTE_DIM(arr) * sizeof(uint32_t);
+
+	if (!rte_thash_gfni_supported)
+		return TEST_SKIPPED;
+
+	/* Derive the GFNI matrices from the long RSS key. */
+	rte_thash_complete_matrix(matrices, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	/*
+	 * arr[] is hashed as raw bytes by the GFNI function;
+	 * arr_softrss[] is the byte-order-converted copy for rte_softrss().
+	 */
+	for (idx = 0; idx < RTE_DIM(arr); idx++) {
+		arr[idx] = rte_rand();
+		arr_softrss[idx] = rte_be_to_cpu_32(arr[idx]);
+	}
+
+	scalar_hash = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key);
+	gfni_hash = rte_thash_gfni(matrices, (uint8_t *)arr, nbytes);
+
+	/* Both implementations must produce the same value. */
+	return (scalar_hash == gfni_hash) ? TEST_SUCCESS : -TEST_FAILED;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +804,10 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_x2),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v2 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (9 preceding siblings ...)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
@ 2021-10-15  9:30 ` Vladimir Medvedkin
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (14 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-15  9:30 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds performance tests for different implementations
of the Toeplitz hash function.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 125 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index f144d8b..b9c4e78 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -141,6 +141,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -315,6 +316,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+	'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..ccc4710
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+/* Number of passes over the batch for each measured implementation. */
+#define ITERATIONS	(1 << 15)
+/* Number of random tuples hashed in each pass. */
+#define	BATCH_SZ	(1 << 10)
+
+/* Tuple lengths, in bytes, exercised by test_thash_perf(). */
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+/* 40-byte RSS key used for every measurement in run_thash_test(). */
+static uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	/* NULL-initialised so the cleanup path can free unconditionally. */
+	uint32_t *tuples[BATCH_SZ] = { NULL };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	/* Tuple length rounded up to a whole number of 32-bit words. */
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	/* volatile accumulator keeps the hash calls from being elided. */
+	volatile uint32_t hash = 0;
+	uint32_t hash_1 = 0;
+	uint32_t hash_2 = 0;
+
+	/* Fill the batch with random tuples. */
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		if (tuples[i] == NULL) {
+			printf("Cannot allocate memory for tuple %u\n", i);
+			goto cleanup;
+		}
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	/* The remaining measurements need the GFNI implementation. */
+	if (!rte_thash_gfni_supported)
+		goto cleanup;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		/* Two tuples per call, so the batch is walked in pairs. */
+		for (j = 0; j < BATCH_SZ; j += 2) {
+			rte_thash_gfni_x2(rss_key_matrixes,
+				(uint8_t *)tuples[j], (uint8_t *)tuples[j + 1],
+				len, &hash_1, &hash_2);
+
+			hash ^= hash_1 ^ hash_2;
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+cleanup:
+	/* Release the batch on every exit path. */
+	for (i = 0; i < BATCH_SZ; i++)
+		rte_free(tuples[i]);
+}
+
+static int
+test_thash_perf(void)
+{
+	/* Tuple lengths to benchmark, in the order they are reported. */
+	static const unsigned int tuple_lens[] = {
+		IPV4_2_TUPLE_LEN,
+		IPV4_4_TUPLE_LEN,
+		IPV6_2_TUPLE_LEN,
+		IPV6_4_TUPLE_LEN,
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < RTE_DIM(tuple_lens); idx++)
+		run_thash_test(tuple_lens[idx]);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] hash: add new toeplitz hash implementation
  2021-10-15  9:11     ` Medvedkin, Vladimir
@ 2021-10-15 10:55       ` Ananyev, Konstantin
  2021-10-15 13:09         ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-15 10:55 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson,
	Bruce, Mcnamara, John


> >> +/**
> >> + * Calculate Toeplitz hash.
> >> + *
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice.
> >> + *
> >> + * @param m
> >> + *  Pointer to the matrices generated from the corresponding
> >> + *  RSS hash key using rte_thash_complete_matrix().
> >> + * @param tuple
> >> + *  Pointer to the data to be hashed. Data must be in network byte order.
> >> + * @param len
> >> + *  Length of the data to be hashed.
> >> + * @return
> >> + *  Calculated Toeplitz hash value.
> >> + */
> >> +__rte_experimental
> >> +static inline uint32_t
> >> +rte_thash_gfni(uint64_t *m, uint8_t *tuple, int len)
> >> +{
> >> +	uint32_t val, val_zero;
> >> +
> >> +	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
> >> +	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
> >> +
> >> +	return val;
> >> +}
> >> +
> >> +/**
> >> + * Calculate Toeplitz hash for two independent data buffers.
> >> + *
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice.
> >> + *
> >> + * @param m
> >> + *  Pointer to the matrices generated from the corresponding
> >> + *  RSS hash key using rte_thash_complete_matrix().
> >> + * @param tuple_1
> >> + *  Pointer to the data to be hashed. Data must be in network byte order.
> >> + * @param tuple_2
> >> + *  Pointer to the data to be hashed. Data must be in network byte order.
> >> + * @param len
> >> + *  Length of the largest data buffer to be hashed.
> >> + * @param val_1
> >> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
> >> + *  the first tuple.
> >> + * @param val_2
> >> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
> >> + *  the second tuple.
> >> + */
> >> +__rte_experimental
> >> +static inline void
> >> +rte_thash_gfni_x2(uint64_t *mtrx, uint8_t *tuple_1, uint8_t *tuple_2, int len,
> >> +	uint32_t *val_1, uint32_t *val_2)
> >
> > Why just two?
> > Why not uint8_t *tuple[]
> > ?
> >
> 
> x2 version was added because there was unused space inside the ZMM which
> holds input key (input tuple) bytes for a second input key, so it helps
> to improve performance in some cases.
> Bulk version wasn't added because for the vast majority of cases it will
> be used with a single input key.
> Hiding this function inside .c will greatly affect performance, because
> it takes just a few cycles to calculate the hash for the most popular
> key sizes.

Ok, but it still unclear to me why for 2 only?
What stops you from doing:
static inline void
rte_thash_gfni_bulk(const uint64_t *mtrx, uint32_t len, uint8_t *tuple[], uint32_t val[], uint32_t num)
{
	for (i = 0; i != (num & ~1); i += 2) {
                    xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i+ 1], len);
                    __rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
	}
	If (num & 1) {
		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
                    	__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
	}  
}
?


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] hash: add new toeplitz hash implementation
  2021-10-15 10:55       ` Ananyev, Konstantin
@ 2021-10-15 13:09         ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-15 13:09 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev
  Cc: Chilikin, Andrey, Wang, Yipeng1, Gobriel, Sameh, Richardson,
	Bruce, Mcnamara, John



On 15/10/2021 12:55, Ananyev, Konstantin wrote:
> 
>>>> +/**
>>>> + * Calculate Toeplitz hash.
>>>> + *
>>>> + * @warning
>>>> + * @b EXPERIMENTAL: this API may change without prior notice.
>>>> + *
>>>> + * @param m
>>>> + *  Pointer to the matrices generated from the corresponding
>>>> + *  RSS hash key using rte_thash_complete_matrix().
>>>> + * @param tuple
>>>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>>>> + * @param len
>>>> + *  Length of the data to be hashed.
>>>> + * @return
>>>> + *  Calculated Toeplitz hash value.
>>>> + */
>>>> +__rte_experimental
>>>> +static inline uint32_t
>>>> +rte_thash_gfni(uint64_t *m, uint8_t *tuple, int len)
>>>> +{
>>>> +	uint32_t val, val_zero;
>>>> +
>>>> +	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
>>>> +	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
>>>> +
>>>> +	return val;
>>>> +}
>>>> +
>>>> +/**
>>>> + * Calculate Toeplitz hash for two independent data buffers.
>>>> + *
>>>> + * @warning
>>>> + * @b EXPERIMENTAL: this API may change without prior notice.
>>>> + *
>>>> + * @param m
>>>> + *  Pointer to the matrices generated from the corresponding
>>>> + *  RSS hash key using rte_thash_complete_matrix().
>>>> + * @param tuple_1
>>>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>>>> + * @param tuple_2
>>>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>>>> + * @param len
>>>> + *  Length of the largest data buffer to be hashed.
>>>> + * @param val_1
>>>> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
>>>> + *  the first tuple.
>>>> + * @param val_2
>>>> + *  Pointer to uint32_t where to put calculated Toeplitz hash value for
>>>> + *  the second tuple.
>>>> + */
>>>> +__rte_experimental
>>>> +static inline void
>>>> +rte_thash_gfni_x2(uint64_t *mtrx, uint8_t *tuple_1, uint8_t *tuple_2, int len,
>>>> +	uint32_t *val_1, uint32_t *val_2)
>>>
>>> Why just two?
>>> Why not uint8_t *tuple[]
>>> ?
>>>
>>
>> x2 version was added because there was unused space inside the ZMM which
>> holds input key (input tuple) bytes for a second input key, so it helps
>> to improve performance in some cases.
>> Bulk version wasn't added because for the vast majority of cases it will
>> be used with a single input key.
>> Hiding this function inside .c will greatly affect performance, because
>> it takes just a few cycles to calculate the hash for the most popular
>> key sizes.
> 
> Ok, but it still unclear to me why for 2 only?
> What stops you from doing:
> static inline void
> rte_thash_gfni_bulk(const uint64_t *mtrx, uint32_t len, uint8_t *tuple[], uint32_t val[], uint32_t num)
> {
> 	for (i = 0; i != (num & ~1); i += 2) {
>                      xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i+ 1], len);
>                      __rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
> 	}
> 	If (num & 1) {
> 		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
>                      	__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
> 	}
> }
> ?
> 

I think you're right. Given that the mathematical properties of this 
hash function are clear and it is very fast, it can be used with 
rte_hash, so bulk version will be useful. I'll replace in v3 the _x2() 
version with _bulk() as you suggested.

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-15 16:58   ` Stephen Hemminger
  2021-10-18 10:40     ` Ananyev, Konstantin
  2021-10-18 11:08     ` Medvedkin, Vladimir
  0 siblings, 2 replies; 72+ messages in thread
From: Stephen Hemminger @ 2021-10-15 16:58 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson, konstantin.ananyev

On Fri, 15 Oct 2021 10:30:02 +0100
Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:

> +			m[i * 8 + j] = (rss_key[i] << j)|
> +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
> +				(8 - j));
> +		}

This ends up being harder than necessary to read. Maybe split into
multiple statements and/or use temporary variable.

> +RTE_INIT(rte_thash_gfni_init)
> +{
> +	rte_thash_gfni_supported = 0;

Not necessary; in C, globals are initialized to zero by default.

By removing that, the constructor can be placed entirely behind an #ifdef.

> +__rte_internal
> +static inline __m512i
> +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
> +	const uint8_t *secondary_tuple, int len)
> +{
> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
> +						6, 5, 4, 3, 6, 5, 4, 3,
> +						5, 4, 3, 2, 5, 4, 3, 2,
> +						4, 3, 2, 1, 4, 3, 2, 1,
> +						3, 2, 1, 0, 3, 2, 1, 0,
> +						2, 1, 0, -1, 2, 1, 0, -1,
> +						1, 0, -1, -2, 1, 0, -1, -2,
> +						0, -1, -2, -3, 0, -1, -2, -3);

NAK

Please don't put the implementation in an inline. This makes it harder
to support (API/ABI) and blocks other architectures from implementing
same thing with different instructions.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation
  2021-10-15 16:58   ` Stephen Hemminger
@ 2021-10-18 10:40     ` Ananyev, Konstantin
  2021-10-19  1:15       ` Stephen Hemminger
  2021-10-18 11:08     ` Medvedkin, Vladimir
  1 sibling, 1 reply; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-18 10:40 UTC (permalink / raw)
  To: Stephen Hemminger, Medvedkin, Vladimir
  Cc: dev, Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce


> On Fri, 15 Oct 2021 10:30:02 +0100
> Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
> 
> > +			m[i * 8 + j] = (rss_key[i] << j)|
> > +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
> > +				(8 - j));
> > +		}
> 
> This ends up being harder than necessary to read. Maybe split into
> multiple statements and/or use temporary variable.
> 
> > +RTE_INIT(rte_thash_gfni_init)
> > +{
> > +	rte_thash_gfni_supported = 0;
> 
> Not necessary in C globals are initialized to zero by default.
> 
> By removing that the constructor can be totally behind #ifdef
> 
> > +__rte_internal
> > +static inline __m512i
> > +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
> > +	const uint8_t *secondary_tuple, int len)
> > +{
> > +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
> > +						6, 5, 4, 3, 6, 5, 4, 3,
> > +						5, 4, 3, 2, 5, 4, 3, 2,
> > +						4, 3, 2, 1, 4, 3, 2, 1,
> > +						3, 2, 1, 0, 3, 2, 1, 0,
> > +						2, 1, 0, -1, 2, 1, 0, -1,
> > +						1, 0, -1, -2, 1, 0, -1, -2,
> > +						0, -1, -2, -3, 0, -1, -2, -3);
> 
> NAK
> 
> Please don't put the implementation in an inline. This makes it harder
> to support (API/ABI) and blocks other architectures from implementing
> same thing with different instructions.

I don't really understand your reasoning here.
rte_thash_gfni.h is an arch-specific header, which provides
arch-specific optimizations for RSS hash calculation
(Vladimir pls correct me if I am wrong here).
We do have dozens of inline functions that do use arch-specific instructions (both x86 and arm)
for different purposes:
sync primitives, memory-ordering, cache manipulations, LPM lookup, TSX, power-saving, etc.
That's a usual trade-off taken for performance reasons, when an extra function call
costs too much compared to the operation itself.
Why has it suddenly become a problem in this particular case, and how exactly does it block other architectures?
Also, I don't understand how it makes things harder in terms of API/ABI stability.
As far as I can see, this patch doesn't introduce any public structs/unions.
All functions take as arguments just raw data buffers and length.
To summarize - in general, I don't see any good reason why this patch shouldn't be allowed.
Konstantin
 




^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation
  2021-10-15 16:58   ` Stephen Hemminger
  2021-10-18 10:40     ` Ananyev, Konstantin
@ 2021-10-18 11:08     ` Medvedkin, Vladimir
  1 sibling, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-18 11:08 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson, konstantin.ananyev

Hi Stephen,

Thanks for reviewing

On 15/10/2021 18:58, Stephen Hemminger wrote:
> On Fri, 15 Oct 2021 10:30:02 +0100
> Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
> 
>> +			m[i * 8 + j] = (rss_key[i] << j)|
>> +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
>> +				(8 - j));
>> +		}
> 
> This ends up being harder than necessary to read. Maybe split into
> multiple statements and/or use temporary variable.
> 
>> +RTE_INIT(rte_thash_gfni_init)
>> +{
>> +	rte_thash_gfni_supported = 0;
> 
> Not necessary in C globals are initialized to zero by default.
> 
> By removing that the constructor can be totally behind #ifdef
> 
>> +__rte_internal
>> +static inline __m512i
>> +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
>> +	const uint8_t *secondary_tuple, int len)
>> +{
>> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
>> +						6, 5, 4, 3, 6, 5, 4, 3,
>> +						5, 4, 3, 2, 5, 4, 3, 2,
>> +						4, 3, 2, 1, 4, 3, 2, 1,
>> +						3, 2, 1, 0, 3, 2, 1, 0,
>> +						2, 1, 0, -1, 2, 1, 0, -1,
>> +						1, 0, -1, -2, 1, 0, -1, -2,
>> +						0, -1, -2, -3, 0, -1, -2, -3);
> 
> NAK
> 
> Please don't put the implementation in an inline. This makes it harder
> to support (API/ABI) and blocks other architectures from implementing
> same thing with different instructions.
> 

By making this function not inline, its performance drops by about a
factor of two. Compiler optimization (at least with respect to the len
argument) helps a lot in the implementation.


-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation
  2021-10-18 10:40     ` Ananyev, Konstantin
@ 2021-10-19  1:15       ` Stephen Hemminger
  2021-10-19 15:42         ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Stephen Hemminger @ 2021-10-19  1:15 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Medvedkin, Vladimir, dev, Wang, Yipeng1, Gobriel, Sameh,
	Richardson, Bruce

On Mon, 18 Oct 2021 10:40:00 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> > On Fri, 15 Oct 2021 10:30:02 +0100
> > Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
> >   
> > > +			m[i * 8 + j] = (rss_key[i] << j)|
> > > +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
> > > +				(8 - j));
> > > +		}  
> > 
> > This ends up being harder than necessary to read. Maybe split into
> > multiple statements and/or use temporary variable.
> >   
> > > +RTE_INIT(rte_thash_gfni_init)
> > > +{
> > > +	rte_thash_gfni_supported = 0;  
> > 
> > Not necessary in C globals are initialized to zero by default.
> > 
> > By removing that the constructor can be totally behind #ifdef
> >   
> > > +__rte_internal
> > > +static inline __m512i
> > > +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
> > > +	const uint8_t *secondary_tuple, int len)
> > > +{
> > > +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
> > > +						6, 5, 4, 3, 6, 5, 4, 3,
> > > +						5, 4, 3, 2, 5, 4, 3, 2,
> > > +						4, 3, 2, 1, 4, 3, 2, 1,
> > > +						3, 2, 1, 0, 3, 2, 1, 0,
> > > +						2, 1, 0, -1, 2, 1, 0, -1,
> > > +						1, 0, -1, -2, 1, 0, -1, -2,
> > > +						0, -1, -2, -3, 0, -1, -2, -3);  
> > 
> > NAK
> > 
> > Please don't put the implementation in an inline. This makes it harder
> > to support (API/ABI) and blocks other architectures from implementing
> > same thing with different instructions.  
> 
> I don't really understand your reasoning here.
> rte_thash_gfni.h is an arch-specific header, which provides
> arch-specific optimizations for RSS hash calculation
> (Vladimir pls correct me if I am wrong here).

Ok, but rte_thash_gfni.h is included on all architectures.

> We do have dozens of inline functions that do use arch-specific instructions (both x86 and arm)
> for different purposes:
> sync primitives, memory-ordering, cache manipulations, LPM lookup, TSX, power-saving, etc.
> That's a usual trade-off taken for performance reasons, when extra function call
> costs too much comparing to the operation itself.
> Why it suddenly became a problem for that particular case and how exactly it blocks other architectures?
> Also I don't understand how it makes things harder in terms of API/ABI stability.
> As I can see this patch doesn't introduce any public structs/unions.
> All functions take as arguments just raw data buffers and length.
> To summarize - in general, I don't see any good reason why this patch shouldn't be allowed.
> Konstantin

The comments about rte_thash_gfni_supported initialization still apply.
Why not:

#ifdef __GFNI__
RTE_INIT(rte_thash_gfni_init)
{
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI))
		rte_thash_gfni_supported = 1;
}
#endif

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation
  2021-10-19  1:15       ` Stephen Hemminger
@ 2021-10-19 15:42         ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-19 15:42 UTC (permalink / raw)
  To: Stephen Hemminger, Ananyev, Konstantin
  Cc: dev, Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce

Hi Stephen,

On 19/10/2021 03:15, Stephen Hemminger wrote:
> On Mon, 18 Oct 2021 10:40:00 +0000
> "Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:
> 
>>> On Fri, 15 Oct 2021 10:30:02 +0100
>>> Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
>>>    
>>>> +			m[i * 8 + j] = (rss_key[i] << j)|
>>>> +				(uint8_t)((uint16_t)(rss_key[i + 1]) >>
>>>> +				(8 - j));
>>>> +		}
>>>
>>> This ends up being harder than necessary to read. Maybe split into
>>> multiple statements and/or use temporary variable.
>>>    
>>>> +RTE_INIT(rte_thash_gfni_init)
>>>> +{
>>>> +	rte_thash_gfni_supported = 0;
>>>
>>> Not necessary in C globals are initialized to zero by default.
>>>
>>> By removing that the constructor can be totally behind #ifdef
>>>    
>>>> +__rte_internal
>>>> +static inline __m512i
>>>> +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
>>>> +	const uint8_t *secondary_tuple, int len)
>>>> +{
>>>> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
>>>> +						6, 5, 4, 3, 6, 5, 4, 3,
>>>> +						5, 4, 3, 2, 5, 4, 3, 2,
>>>> +						4, 3, 2, 1, 4, 3, 2, 1,
>>>> +						3, 2, 1, 0, 3, 2, 1, 0,
>>>> +						2, 1, 0, -1, 2, 1, 0, -1,
>>>> +						1, 0, -1, -2, 1, 0, -1, -2,
>>>> +						0, -1, -2, -3, 0, -1, -2, -3);
>>>
>>> NAK
>>>
>>> Please don't put the implementation in an inline. This makes it harder
>>> to support (API/ABI) and blocks other architectures from implementing
>>> same thing with different instructions.
>>
>> I don't really understand your reasoning here.
>> rte_thash_gfni.h is an arch-specific header, which provides
>> arch-specific optimizations for RSS hash calculation
>> (Vladimir pls correct me if I am wrong here).
> 
> Ok, but rte_thash_gfni.h is included on all architectures.
> 

Ok, I'll rework the patch to move the x86 + AVX512 related things into an
x86 arch-specific header. Would that suit?

>> We do have dozens of inline functions that do use arch-specific instructions (both x86 and arm)
>> for different purposes:
>> sync primitives, memory-ordering, cache manipulations, LPM lookup, TSX, power-saving, etc.
>> That's a usual trade-off taken for performance reasons, when extra function call
>> costs too much comparing to the operation itself.
>> Why it suddenly became a problem for that particular case and how exactly it blocks other architectures?
>> Also I don't understand how it makes things harder in terms of API/ABI stability.
>> As I can see this patch doesn't introduce any public structs/unions.
>> All functions take as arguments just raw data buffers and length.
>> To summarize - in general, I don't see any good reason why this patch shouldn't be allowed.
>> Konstantin
> 
> The comments about rte_thash_gfni_supported initialization still apply.
> Why not:
> 
> #ifdef __GFNI__
> RTE_INIT(rte_thash_gfni_init)
> {
> 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI))
> 		rte_thash_gfni_supported = 1;
> }
> #endif
> 

Agreed, I'll reflect these changes in v3.


-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v3 0/5] optimized Toeplitz hash implementation
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-20 18:20   ` Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
                       ` (5 more replies)
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
                     ` (4 subsequent siblings)
  5 siblings, 6 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-20 18:20 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch series adds a new optimized implementation for the Toeplitz hash
function using Galois Fields New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
piece of data; a bulk variant, rte_thash_gfni_bulk(), is also provided.
For performance reasons, the implementation was placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by calling rte_thash_gfni_supported() at runtime) before calling
these functions.

v3:
- implementation moved to x86 specific header
- added rte_thash_gfni_supported() instead of the variable
- removed RTE_INIT section, due to adding rte_thash_gfni_supported()
- reworked rte_thash_complete_matrix() to make it easier to read

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (5):
  hash: add new toeplitz hash implementation
  hash: enable gfni thash implementation
  doc/hash: update documentation for the thash library
  test/thash: add tests for a new Toeplitz hash function
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 237 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 120 ++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  37 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   1 +
 lib/hash/rte_thash.c                        |  71 ++++++++-
 lib/hash/rte_thash.h                        |  54 +++++++
 lib/hash/rte_thash_gfni.h                   |  85 ++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 221 ++++++++++++++++++++++++++
 lib/hash/version.map                        |   3 +
 12 files changed, 828 insertions(+), 8 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
@ 2021-10-20 18:20   ` Vladimir Medvedkin
  2021-10-21  9:42     ` Ananyev, Konstantin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-20 18:20 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds a new Toeplitz hash implementation using
Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/api/doxy-api-index.md     |   1 +
 lib/hash/meson.build          |   1 +
 lib/hash/rte_thash.c          |  29 ++++++
 lib/hash/rte_thash.h          |  35 +++++++
 lib/hash/rte_thash_gfni.h     |  85 ++++++++++++++++
 lib/hash/rte_thash_x86_gfni.h | 221 ++++++++++++++++++++++++++++++++++++++++++
 lib/hash/version.map          |   2 +
 7 files changed, 374 insertions(+)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 1992107..7549477 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -139,6 +139,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..40444ac 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,6 +7,7 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
 )
 indirect_headers += files('rte_crc_arm64.h')
 
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..e605a6f 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,35 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512))
+		return 1;
+#endif
+
+	return 0;
+};
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	int i, j;
+	uint8_t *m = (uint8_t *)matrixes;
+	uint8_t left_part, right_part;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < 8; j++) {
+			left_part = rss_key[i] << j;
+			right_part = (uint16_t)(rss_key[i + 1]) >> (8 - j);
+			m[i * 8 + j] = left_part|right_part;
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 76109fc..a406be0 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -28,6 +28,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -223,6 +224,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash are supported.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if GFNI is supported
+ *  0 otherwise
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts a Toeplitz hash key (RSS key) into the matrices required
+ * for the GFNI implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..f59587f
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param key
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Dummy implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Array of pointers to the data to be hashed.
+ *  Data must be in network byte order.
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
+	int len __rte_unused, uint8_t *tuple[] __rte_unused,
+	uint32_t val[], uint32_t num)
+{
+	unsigned int i;
+
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	for (i = 0; i < num; i++)
+		val[i] = 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..faa340a
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+
+	for (; len > 0; len -= 64, tuple += 64) {
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
+
+	return val;
+}
+
+/**
+ * Bulk implementation for Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Array of pointers to the data to be hashed.
+ *  Data must be in network byte order.
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
+	uint32_t val[], uint32_t num)
+{
+	uint32_t i;
+	uint32_t val_zero;
+	__m512i xor_acc;
+
+	for (i = 0; i != (num & ~1); i += 2) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
+		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
+	}
+
+	if (num & 1) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
+		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
+	}
+}
+
+#endif /* __GFNI__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index ce4309a..cecf922 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -39,10 +39,12 @@ EXPERIMENTAL {
 	rte_hash_rcu_qsbr_add;
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
+	rte_thash_complete_matrix;
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
 	rte_thash_get_helper;
 	rte_thash_get_key;
+	rte_thash_gfni_supported;
 	rte_thash_init_ctx;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-20 18:20   ` Vladimir Medvedkin
  2021-10-21  9:46     ` Ananyev, Konstantin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-20 18:20 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch enables the new GFNI Toeplitz hash in
the predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index e605a6f..242d0ff 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -87,6 +87,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -266,12 +268,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported()) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -385,6 +403,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -641,6 +663,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	return ctx->matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -752,11 +780,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a406be0..d12ab81 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -423,6 +423,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the toeplitz hash matrices contained in the context.
+ * These matrices could be used with fast toeplitz hash implementation if
+ * CPU supports GFNI.
+ * Matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the toeplitz hash key matrices on success
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index cecf922..3eda695 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -43,6 +43,7 @@ EXPERIMENTAL {
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
+	rte_thash_get_gfni_matrices;
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_gfni_supported;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v3 3/5] doc/hash: update documentation for the thash library
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                     ` (2 preceding siblings ...)
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-20 18:20   ` Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-20 18:20 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds documentation for the new optimized Toeplitz hash
implementation using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
 doc/guides/rel_notes/release_21_11.rst      |  4 ++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..6f50a18 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
+* ``rte_thash_gfni_x2()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above-mentioned _softrss_ functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last two functions are vectorized implementations using
+Galois Fields New Instructions. They can be used only if ``rte_thash_gfni_supported()`` returns true.
+They expect the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_x2()`` calculates hash values for two independent tuples in one go.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_gfni_x2()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* Two tuple pointers.
+* A length of the longest tuple in bytes.
+* Two pointers to ``uint32_t`` values where the results are written.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 4c56cdf..5b53117 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -159,6 +159,10 @@ New Features
   * Added tests to verify tunnel header verification in IPsec inbound.
   * Added tests to verify inner checksum.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v3 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                     ` (3 preceding siblings ...)
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
@ 2021-10-20 18:20   ` Vladimir Medvedkin
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-20 18:20 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch provides a set of tests for verifying the new
implementation of Toeplitz hash function using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/test_thash.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..a625306 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,210 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple;
+	uint32_t rss_l3, rss_l3l4;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(v4_tbl); i++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if ((rss_l3 != v4_tbl[i].hash_l3) ||
+				(rss_l3l4 != v4_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_DIM(v6_tbl); i++) {
+		for (j = 0; j < RTE_DIM(tuple.v6.src_addr); j++)
+			tuple.v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple.v6.dst_addr); j++)
+			tuple.v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if ((rss_l3 != v6_tbl[i].hash_l3) ||
+				(rss_l3l4 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_BULK_DATA_BUF_1_HASH_IDX,
+	GFNI_BULK_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+	uint8_t *bulk_data[2];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < 2; i++)
+		bulk_data[i] = (uint8_t *)data[i];
+
+	for (i = 0; i < ITER; i++) {
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]);
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+		rte_thash_gfni_bulk(rss_key_matrixes,
+			DATA_SZ * sizeof(uint32_t), bulk_data,
+			&hash[GFNI_BULK_DATA_BUF_1_HASH_IDX], 2);
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+enum {
+	RSS_V4_IDX,
+	RSS_V6_IDX
+};
+
+static int
+test_toeplitz_hash_gfni_bulk(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	uint8_t *tuples[2];
+	uint32_t rss[2] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/* allocate enough memory for the biggest tuple */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/*Load IPv4 headers and copy it into the corresponding tuple*/
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/*Load IPv6 headers and copy it into the corresponding tuple*/
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		rte_thash_gfni_bulk(rss_key_matrixes, RTE_THASH_V6_L4_LEN * 4,
+			tuples, rss, 2);
+
+		if ((rss[RSS_V4_IDX] != v4_tbl[i].hash_l3l4) ||
+				(rss[RSS_V6_IDX] != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t arr[16];
+	uint32_t arr_softrss[16];
+	uint32_t hash_1, hash_2;
+	uint64_t rss_key_matrixes[RTE_DIM(big_rss_key)];
+	unsigned int i, size = RTE_DIM(arr) * sizeof(uint32_t);
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (i = 0; i < RTE_DIM(arr); i++) {
+		arr[i] = rte_rand();
+		arr_softrss[i] = rte_be_to_cpu_32(arr[i]);
+	}
+
+	hash_1 = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key);
+	hash_2 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)arr, size);
+
+	if (hash_1 != hash_2)
+		return -TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +810,10 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_bulk),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v3 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                     ` (4 preceding siblings ...)
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
@ 2021-10-20 18:20   ` Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-20 18:20 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds performance tests for different implementations
of the Toeplitz hash function.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 120 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index f144d8b..b9c4e78 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -141,6 +141,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -315,6 +316,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+	'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..fb66e20
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define	BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ];
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	volatile uint32_t hash = 0;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	if (!rte_thash_gfni_supported())
+		return;
+
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
+			bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+}
+
+static int
+test_thash_perf(void)
+{
+	run_thash_test(IPV4_2_TUPLE_LEN);
+	run_thash_test(IPV4_4_TUPLE_LEN);
+	run_thash_test(IPV6_2_TUPLE_LEN);
+	run_thash_test(IPV6_4_TUPLE_LEN);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-21  9:42     ` Ananyev, Konstantin
  2021-10-21 17:17       ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-21  9:42 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev
  Cc: Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce, stephen


> This patch adds a new Toeplitz hash implementation using
> Galois Fields New Instructions (GFNI).
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  doc/api/doxy-api-index.md     |   1 +
>  lib/hash/meson.build          |   1 +
>  lib/hash/rte_thash.c          |  29 ++++++
>  lib/hash/rte_thash.h          |  35 +++++++
>  lib/hash/rte_thash_gfni.h     |  85 ++++++++++++++++
>  lib/hash/rte_thash_x86_gfni.h | 221 ++++++++++++++++++++++++++++++++++++++++++
>  lib/hash/version.map          |   2 +
>  7 files changed, 374 insertions(+)
>  create mode 100644 lib/hash/rte_thash_gfni.h
>  create mode 100644 lib/hash/rte_thash_x86_gfni.h
> 
> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> index 1992107..7549477 100644
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -139,6 +139,7 @@ The public API headers are grouped by topics:
>    [hash]               (@ref rte_hash.h),
>    [jhash]              (@ref rte_jhash.h),
>    [thash]              (@ref rte_thash.h),
> +  [thash_gfni]         (@ref rte_thash_gfni.h),
>    [FBK hash]           (@ref rte_fbk_hash.h),
>    [CRC hash]           (@ref rte_hash_crc.h)
> 
> diff --git a/lib/hash/meson.build b/lib/hash/meson.build
> index 9bc5ef9..40444ac 100644
> --- a/lib/hash/meson.build
> +++ b/lib/hash/meson.build
> @@ -7,6 +7,7 @@ headers = files(
>          'rte_hash.h',
>          'rte_jhash.h',
>          'rte_thash.h',
> +        'rte_thash_gfni.h',
>  )
>  indirect_headers += files('rte_crc_arm64.h')
> 
> diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
> index 696a112..e605a6f 100644
> --- a/lib/hash/rte_thash.c
> +++ b/lib/hash/rte_thash.c
> @@ -90,6 +90,35 @@ struct rte_thash_ctx {
>  	uint8_t		hash_key[0];
>  };
> 
> +int
> +rte_thash_gfni_supported(void)
> +{
> +#ifdef RTE_THASH_GFNI_DEFINED
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
> +			(rte_vect_get_max_simd_bitwidth() >=
> +			RTE_VECT_SIMD_512))
> +		return 1;
> +#endif
> +
> +	return 0;
> +};
> +
> +void
> +rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
> +{
> +	int i, j;
> +	uint8_t *m = (uint8_t *)matrixes;
> +	uint8_t left_part, right_part;
> +
> +	for (i = 0; i < size; i++) {
> +		for (j = 0; j < 8; j++) {
> +			left_part = rss_key[i] << j;
> +			right_part = (uint16_t)(rss_key[i + 1]) >> (8 - j);
> +			m[i * 8 + j] = left_part|right_part;
> +		}
> +	}
> +}
> +
>  static inline uint32_t
>  get_bit_lfsr(struct thash_lfsr *lfsr)
>  {
> diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
> index 76109fc..a406be0 100644
> --- a/lib/hash/rte_thash.h
> +++ b/lib/hash/rte_thash.h
> @@ -28,6 +28,7 @@ extern "C" {
>  #include <rte_config.h>
>  #include <rte_ip.h>
>  #include <rte_common.h>
> +#include <rte_thash_gfni.h>
> 
>  #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
>  #include <rte_vect.h>
> @@ -223,6 +224,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
>  	return ret;
>  }
> 
> +/**
> + * Indicates if GFNI implementations of the Toeplitz hash are supported.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @return
> + *  1 if GFNI is supported
> + *  0 otherwise
> + */
> +__rte_experimental
> +int
> +rte_thash_gfni_supported(void);
> +
> +/**
> + * Converts Toeplitz hash key (RSS key) into matrixes required
> + * for GFNI implementation
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param matrixes
> + *  pointer to the memory where matrices will be written.
> + *  Note: the size of this memory must be equal to size * 8
> + * @param rss_key
> + *  pointer to the Toeplitz hash key
> + * @param size
> + *  Size of the rss_key in bytes.
> + */
> +__rte_experimental
> +void
> +rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
> +	int size);
> +
>  /** @internal Logarithm of minimum size of the RSS ReTa */
>  #define	RTE_THASH_RETA_SZ_MIN	2U
>  /** @internal Logarithm of maximum size of the RSS ReTa */
> diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
> new file mode 100644
> index 0000000..f59587f
> --- /dev/null
> +++ b/lib/hash/rte_thash_gfni.h
> @@ -0,0 +1,85 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Intel Corporation
> + */
> +
> +#ifndef _RTE_THASH_GFNI_H_
> +#define _RTE_THASH_GFNI_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#ifdef RTE_ARCH_X86
> +
> +#include <rte_thash_x86_gfni.h>
> +
> +#endif
> +
> +#ifndef RTE_THASH_GFNI_DEFINED
> +
> +/**
> + * Calculate Toeplitz hash.
> + * Dummy implementation.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param m
> + *  Pointer to the matrices generated from the corresponding
> + *  RSS hash key using rte_thash_complete_matrix().
> + * @param tuple
> + *  Pointer to the data to be hashed. Data must be in network byte order.
> + * @param len
> + *  Length of the data to be hashed.
> + * @return
> + *  Calculated Toeplitz hash value.
> + */
> +__rte_experimental
> +static inline uint32_t
> +rte_thash_gfni(const uint64_t *mtrx __rte_unused,
> +	const uint8_t *key __rte_unused, int len __rte_unused)
> +{
> +	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);

One nit: as I can see from test report some compilation fails.
Probably we need to add #include <rte_log.h> to that file.
Apart from that, LGTM.
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>


> +	return 0;
> +}
> +
> +/**
> + * Bulk implementation for Toeplitz hash.
> + * Dummy implementation.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param m
> + *  Pointer to the matrices generated from the corresponding
> + *  RSS hash key using rte_thash_complete_matrix().
> + * @param tuple
> + *  Array of the pointers on data to be hashed.
> + *  Data must be in network byte order.
> + * @param len
> + *  Length of the largest data buffer to be hashed.
> + * @param val
> + *  Array of uint32_t where to put calculated Toeplitz hash values
> + * @param num
> + *  Number of tuples to hash.
> + */
> +__rte_experimental
> +static inline void
> +rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
> +	int len __rte_unused, uint8_t *tuple[] __rte_unused,
> +	uint32_t val[], uint32_t num)
> +{
> +	unsigned int i;
> +
> +	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
> +	for (i = 0; i < num; i++)
> +		val[i] = 0;
> +}
> +
> +#endif /* RTE_THASH_GFNI_DEFINED */
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_THASH_GFNI_H_ */
> diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
> new file mode 100644
> index 0000000..faa340a
> --- /dev/null
> +++ b/lib/hash/rte_thash_x86_gfni.h
> @@ -0,0 +1,221 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Intel Corporation
> + */
> +
> +#ifndef _RTE_THASH_X86_GFNI_H_
> +#define _RTE_THASH_X86_GFNI_H_
> +
> +/**
> + * @file
> + *
> + * Optimized Toeplitz hash functions implementation
> + * using Galois Fields New Instructions.
> + */
> +
> +#include <rte_vect.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#ifdef __GFNI__
> +#define RTE_THASH_GFNI_DEFINED
> +
> +#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
> +#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
> +#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
> +#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
> +#define RTE_THASH_REWIND_MSK		0x0000000000113377
> +
> +__rte_internal
> +static inline void
> +__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
> +{
> +	__m256i tmp_256_1, tmp_256_2;
> +	__m128i tmp128_1, tmp128_2;
> +	uint64_t tmp_1, tmp_2;
> +
> +	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
> +	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
> +	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
> +
> +	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
> +	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
> +	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
> +
> +	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
> +	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
> +	tmp_1 ^= tmp_2;
> +
> +	*val_1 = (uint32_t)tmp_1;
> +	*val_2 = (uint32_t)(tmp_1 >> 32);
> +}
> +
> +__rte_internal
> +static inline __m512i
> +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
> +	const uint8_t *secondary_tuple, int len)
> +{
> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
> +						6, 5, 4, 3, 6, 5, 4, 3,
> +						5, 4, 3, 2, 5, 4, 3, 2,
> +						4, 3, 2, 1, 4, 3, 2, 1,
> +						3, 2, 1, 0, 3, 2, 1, 0,
> +						2, 1, 0, -1, 2, 1, 0, -1,
> +						1, 0, -1, -2, 1, 0, -1, -2,
> +						0, -1, -2, -3, 0, -1, -2, -3);
> +
> +	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 0, 0, 0, 0, 0,
> +						0, 0, 0, 59, 0, 0, 0, 59,
> +						0, 0, 59, 58, 0, 0, 59, 58,
> +						0, 59, 58, 57, 0, 59, 58, 57);
> +	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
> +	const __m512i shift_8 = _mm512_set1_epi8(8);
> +	__m512i xor_acc = _mm512_setzero_si512();
> +	__m512i perm_bytes = _mm512_setzero_si512();
> +	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
> +	__mmask64 load_mask, permute_mask, permute_mask_2;
> +	int chunk_len = 0, i = 0;
> +	uint8_t mtrx_msk;
> +	const int prepend = 3;
> +
> +	for (; len > 0; len -= 64, tuple += 64) {
> +		if (i == 8)
> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
> +				rewind_idx, perm_bytes);
> +
> +		permute_mask = RTE_THASH_FIRST_ITER_MSK;
> +		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
> +		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
> +		if (secondary_tuple) {
> +			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
> +			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
> +				secondary_tuple);
> +		}
> +
> +		chunk_len = __builtin_popcountll(load_mask);
> +		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
> +				permute_mask, permute_idx, tuple_bytes);
> +
> +			if (secondary_tuple)
> +				perm_bytes =
> +					_mm512_mask_permutexvar_epi8(perm_bytes,
> +					permute_mask_2, permute_idx,
> +					tuple_bytes_2);
> +
> +			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
> +			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
> +				matrixes, 0);
> +
> +			xor_acc = _mm512_xor_si512(xor_acc, vals);
> +			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
> +			permute_mask = RTE_THASH_PERM_MSK;
> +			if (secondary_tuple)
> +				permute_mask_2 = RTE_THASH_PERM_MSK_2;
> +		}
> +	}
> +
> +	int rest_len = (chunk_len + prepend) % 8;
> +	if (rest_len != 0) {
> +		mtrx_msk = (1 << (rest_len % 8)) - 1;
> +		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
> +		if (i == 8) {
> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
> +				rewind_idx, perm_bytes);
> +		} else {
> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
> +				permute_mask, permute_idx, tuple_bytes);
> +
> +			if (secondary_tuple)
> +				perm_bytes =
> +					_mm512_mask_permutexvar_epi8(
> +					perm_bytes, permute_mask_2,
> +					permute_idx, tuple_bytes_2);
> +		}
> +
> +		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
> +		xor_acc = _mm512_xor_si512(xor_acc, vals);
> +	}
> +
> +	return xor_acc;
> +}
> +
> +/**
> + * Calculate Toeplitz hash.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param m
> + *  Pointer to the matrices generated from the corresponding
> + *  RSS hash key using rte_thash_complete_matrix().
> + * @param tuple
> + *  Pointer to the data to be hashed. Data must be in network byte order.
> + * @param len
> + *  Length of the data to be hashed.
> + * @return
> + *  Calculated Toeplitz hash value.
> + */
> +__rte_experimental
> +static inline uint32_t
> +rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
> +{
> +	uint32_t val, val_zero;
> +
> +	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
> +	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
> +
> +	return val;
> +}
> +
> +/**
> + * Bulk implementation for Toeplitz hash.
> + *
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * @param m
> + *  Pointer to the matrices generated from the corresponding
> + *  RSS hash key using rte_thash_complete_matrix().
> + * @param tuple
> + *  Array of the pointers on data to be hashed.
> + *  Data must be in network byte order.
> + * @param len
> + *  Length of the largest data buffer to be hashed.
> + * @param val
> + *  Array of uint32_t where to put calculated Toeplitz hash values
> + * @param num
> + *  Number of tuples to hash.
> + */
> +__rte_experimental
> +static inline void
> +rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
> +	uint32_t val[], uint32_t num)
> +{
> +	uint32_t i;
> +	uint32_t val_zero;
> +	__m512i xor_acc;
> +
> +	for (i = 0; i != (num & ~1); i += 2) {
> +		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
> +		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
> +	}
> +
> +	if (num & 1) {
> +		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
> +		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
> +	}
> +}
> +
> +#endif /* _GFNI_ */
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_THASH_X86_GFNI_H_ */
> diff --git a/lib/hash/version.map b/lib/hash/version.map
> index ce4309a..cecf922 100644
> --- a/lib/hash/version.map
> +++ b/lib/hash/version.map
> @@ -39,10 +39,12 @@ EXPERIMENTAL {
>  	rte_hash_rcu_qsbr_add;
>  	rte_thash_add_helper;
>  	rte_thash_adjust_tuple;
> +	rte_thash_complete_matrix;
>  	rte_thash_find_existing;
>  	rte_thash_free_ctx;
>  	rte_thash_get_complement;
>  	rte_thash_get_helper;
>  	rte_thash_get_key;
> +	rte_thash_gfni_supported;
>  	rte_thash_init_ctx;
>  };
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-21  9:46     ` Ananyev, Konstantin
  0 siblings, 0 replies; 72+ messages in thread
From: Ananyev, Konstantin @ 2021-10-21  9:46 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev
  Cc: Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce, stephen



> -----Original Message-----
> From: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>
> Sent: Wednesday, October 20, 2021 7:20 PM
> To: dev@dpdk.org
> Cc: Wang, Yipeng1 <yipeng1.wang@intel.com>; Gobriel, Sameh <sameh.gobriel@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>; stephen@networkplumber.org
> Subject: [PATCH v3 2/5] hash: enable gfni thash implementation
> 
> This patch enables the new GFNI Toeplitz hash in
> the predictable RSS library.
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
>  lib/hash/rte_thash.h | 19 +++++++++++++++++++
>  lib/hash/version.map |  1 +
>  3 files changed, 58 insertions(+), 4 deletions(-)
> 

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

> --
> 2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation
  2021-10-21  9:42     ` Ananyev, Konstantin
@ 2021-10-21 17:17       ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-21 17:17 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev
  Cc: Wang, Yipeng1, Gobriel, Sameh, Richardson, Bruce, stephen

Hi Konstantin,

On 21/10/2021 11:42, Ananyev, Konstantin wrote:
> 
>> This patch adds a new Toeplitz hash implementation using
>> Galois Fields New Instructions (GFNI).
>>
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>> ---
>>   doc/api/doxy-api-index.md     |   1 +
>>   lib/hash/meson.build          |   1 +
>>   lib/hash/rte_thash.c          |  29 ++++++
>>   lib/hash/rte_thash.h          |  35 +++++++
>>   lib/hash/rte_thash_gfni.h     |  85 ++++++++++++++++
>>   lib/hash/rte_thash_x86_gfni.h | 221 ++++++++++++++++++++++++++++++++++++++++++
>>   lib/hash/version.map          |   2 +
>>   7 files changed, 374 insertions(+)
>>   create mode 100644 lib/hash/rte_thash_gfni.h
>>   create mode 100644 lib/hash/rte_thash_x86_gfni.h
>>
>> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
>> index 1992107..7549477 100644
>> --- a/doc/api/doxy-api-index.md
>> +++ b/doc/api/doxy-api-index.md
>> @@ -139,6 +139,7 @@ The public API headers are grouped by topics:
>>     [hash]               (@ref rte_hash.h),
>>     [jhash]              (@ref rte_jhash.h),
>>     [thash]              (@ref rte_thash.h),
>> +  [thash_gfni]         (@ref rte_thash_gfni.h),
>>     [FBK hash]           (@ref rte_fbk_hash.h),
>>     [CRC hash]           (@ref rte_hash_crc.h)
>>
>> diff --git a/lib/hash/meson.build b/lib/hash/meson.build
>> index 9bc5ef9..40444ac 100644
>> --- a/lib/hash/meson.build
>> +++ b/lib/hash/meson.build
>> @@ -7,6 +7,7 @@ headers = files(
>>           'rte_hash.h',
>>           'rte_jhash.h',
>>           'rte_thash.h',
>> +        'rte_thash_gfni.h',
>>   )
>>   indirect_headers += files('rte_crc_arm64.h')
>>
>> diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
>> index 696a112..e605a6f 100644
>> --- a/lib/hash/rte_thash.c
>> +++ b/lib/hash/rte_thash.c
>> @@ -90,6 +90,35 @@ struct rte_thash_ctx {
>>   	uint8_t		hash_key[0];
>>   };
>>
>> +int
>> +rte_thash_gfni_supported(void)
>> +{
>> +#ifdef RTE_THASH_GFNI_DEFINED
>> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
>> +			(rte_vect_get_max_simd_bitwidth() >=
>> +			RTE_VECT_SIMD_512))
>> +		return 1;
>> +#endif
>> +
>> +	return 0;
>> +};
>> +
>> +void
>> +rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
>> +{
>> +	int i, j;
>> +	uint8_t *m = (uint8_t *)matrixes;
>> +	uint8_t left_part, right_part;
>> +
>> +	for (i = 0; i < size; i++) {
>> +		for (j = 0; j < 8; j++) {
>> +			left_part = rss_key[i] << j;
>> +			right_part = (uint16_t)(rss_key[i + 1]) >> (8 - j);
>> +			m[i * 8 + j] = left_part|right_part;
>> +		}
>> +	}
>> +}
>> +
>>   static inline uint32_t
>>   get_bit_lfsr(struct thash_lfsr *lfsr)
>>   {
>> diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
>> index 76109fc..a406be0 100644
>> --- a/lib/hash/rte_thash.h
>> +++ b/lib/hash/rte_thash.h
>> @@ -28,6 +28,7 @@ extern "C" {
>>   #include <rte_config.h>
>>   #include <rte_ip.h>
>>   #include <rte_common.h>
>> +#include <rte_thash_gfni.h>
>>
>>   #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
>>   #include <rte_vect.h>
>> @@ -223,6 +224,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
>>   	return ret;
>>   }
>>
>> +/**
>> + * Indicates if GFNI implementations of the Toeplitz hash are supported.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @return
>> + *  1 if GFNI is supported
>> + *  0 otherwise
>> + */
>> +__rte_experimental
>> +int
>> +rte_thash_gfni_supported(void);
>> +
>> +/**
>> + * Converts Toeplitz hash key (RSS key) into matrixes required
>> + * for GFNI implementation
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param matrixes
>> + *  pointer to the memory where matrices will be written.
>> + *  Note: the size of this memory must be equal to size * 8
>> + * @param rss_key
>> + *  pointer to the Toeplitz hash key
>> + * @param size
>> + *  Size of the rss_key in bytes.
>> + */
>> +__rte_experimental
>> +void
>> +rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
>> +	int size);
>> +
>>   /** @internal Logarithm of minimum size of the RSS ReTa */
>>   #define	RTE_THASH_RETA_SZ_MIN	2U
>>   /** @internal Logarithm of maximum size of the RSS ReTa */
>> diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
>> new file mode 100644
>> index 0000000..f59587f
>> --- /dev/null
>> +++ b/lib/hash/rte_thash_gfni.h
>> @@ -0,0 +1,85 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2021 Intel Corporation
>> + */
>> +
>> +#ifndef _RTE_THASH_GFNI_H_
>> +#define _RTE_THASH_GFNI_H_
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#ifdef RTE_ARCH_X86
>> +
>> +#include <rte_thash_x86_gfni.h>
>> +
>> +#endif
>> +
>> +#ifndef RTE_THASH_GFNI_DEFINED
>> +
>> +/**
>> + * Calculate Toeplitz hash.
>> + * Dummy implementation.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param m
>> + *  Pointer to the matrices generated from the corresponding
>> + *  RSS hash key using rte_thash_complete_matrix().
>> + * @param tuple
>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>> + * @param len
>> + *  Length of the data to be hashed.
>> + * @return
>> + *  Calculated Toeplitz hash value.
>> + */
>> +__rte_experimental
>> +static inline uint32_t
>> +rte_thash_gfni(const uint64_t *mtrx __rte_unused,
>> +	const uint8_t *key __rte_unused, int len __rte_unused)
>> +{
>> +	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
> 
> One nit: as I can see from the test report, some compilations fail.
> We probably need to add #include <rte_log.h> to that file.
> Apart from that, LGTM.
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> 

Thanks, I'll send v4

> 
>> +	return 0;
>> +}
>> +
>> +/**
>> + * Bulk implementation for Toeplitz hash.
>> + * Dummy implementation.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param m
>> + *  Pointer to the matrices generated from the corresponding
>> + *  RSS hash key using rte_thash_complete_matrix().
>> + * @param tuple
>> + *  Array of the pointers on data to be hashed.
>> + *  Data must be in network byte order.
>> + * @param len
>> + *  Length of the largest data buffer to be hashed.
>> + * @param val
>> + *  Array of uint32_t where to put calculated Toeplitz hash values
>> + * @param num
>> + *  Number of tuples to hash.
>> + */
>> +__rte_experimental
>> +static inline void
>> +rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
>> +	int len __rte_unused, uint8_t *tuple[] __rte_unused,
>> +	uint32_t val[], uint32_t num)
>> +{
>> +	unsigned int i;
>> +
>> +	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
>> +	for (i = 0; i < num; i++)
>> +		val[i] = 0;
>> +}
>> +
>> +#endif /* RTE_THASH_GFNI_DEFINED */
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_THASH_GFNI_H_ */
>> diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
>> new file mode 100644
>> index 0000000..faa340a
>> --- /dev/null
>> +++ b/lib/hash/rte_thash_x86_gfni.h
>> @@ -0,0 +1,221 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2021 Intel Corporation
>> + */
>> +
>> +#ifndef _RTE_THASH_X86_GFNI_H_
>> +#define _RTE_THASH_X86_GFNI_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * Optimized Toeplitz hash functions implementation
>> + * using Galois Fields New Instructions.
>> + */
>> +
>> +#include <rte_vect.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#ifdef __GFNI__
>> +#define RTE_THASH_GFNI_DEFINED
>> +
>> +#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
>> +#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
>> +#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
>> +#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
>> +#define RTE_THASH_REWIND_MSK		0x0000000000113377
>> +
>> +__rte_internal
>> +static inline void
>> +__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
>> +{
>> +	__m256i tmp_256_1, tmp_256_2;
>> +	__m128i tmp128_1, tmp128_2;
>> +	uint64_t tmp_1, tmp_2;
>> +
>> +	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
>> +	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
>> +	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
>> +
>> +	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
>> +	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
>> +	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
>> +
>> +	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
>> +	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
>> +	tmp_1 ^= tmp_2;
>> +
>> +	*val_1 = (uint32_t)tmp_1;
>> +	*val_2 = (uint32_t)(tmp_1 >> 32);
>> +}
>> +
>> +__rte_internal
>> +static inline __m512i
>> +__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
>> +	const uint8_t *secondary_tuple, int len)
>> +{
>> +	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
>> +						6, 5, 4, 3, 6, 5, 4, 3,
>> +						5, 4, 3, 2, 5, 4, 3, 2,
>> +						4, 3, 2, 1, 4, 3, 2, 1,
>> +						3, 2, 1, 0, 3, 2, 1, 0,
>> +						2, 1, 0, -1, 2, 1, 0, -1,
>> +						1, 0, -1, -2, 1, 0, -1, -2,
>> +						0, -1, -2, -3, 0, -1, -2, -3);
>> +
>> +	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 0, 0, 0, 0, 0,
>> +						0, 0, 0, 59, 0, 0, 0, 59,
>> +						0, 0, 59, 58, 0, 0, 59, 58,
>> +						0, 59, 58, 57, 0, 59, 58, 57);
>> +	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
>> +	const __m512i shift_8 = _mm512_set1_epi8(8);
>> +	__m512i xor_acc = _mm512_setzero_si512();
>> +	__m512i perm_bytes = _mm512_setzero_si512();
>> +	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
>> +	__mmask64 load_mask, permute_mask, permute_mask_2;
>> +	int chunk_len = 0, i = 0;
>> +	uint8_t mtrx_msk;
>> +	const int prepend = 3;
>> +
>> +	for (; len > 0; len -= 64, tuple += 64) {
>> +		if (i == 8)
>> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
>> +				rewind_idx, perm_bytes);
>> +
>> +		permute_mask = RTE_THASH_FIRST_ITER_MSK;
>> +		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
>> +		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
>> +		if (secondary_tuple) {
>> +			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
>> +			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
>> +				secondary_tuple);
>> +		}
>> +
>> +		chunk_len = __builtin_popcountll(load_mask);
>> +		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
>> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
>> +				permute_mask, permute_idx, tuple_bytes);
>> +
>> +			if (secondary_tuple)
>> +				perm_bytes =
>> +					_mm512_mask_permutexvar_epi8(perm_bytes,
>> +					permute_mask_2, permute_idx,
>> +					tuple_bytes_2);
>> +
>> +			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
>> +			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
>> +				matrixes, 0);
>> +
>> +			xor_acc = _mm512_xor_si512(xor_acc, vals);
>> +			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
>> +			permute_mask = RTE_THASH_PERM_MSK;
>> +			if (secondary_tuple)
>> +				permute_mask_2 = RTE_THASH_PERM_MSK_2;
>> +		}
>> +	}
>> +
>> +	int rest_len = (chunk_len + prepend) % 8;
>> +	if (rest_len != 0) {
>> +		mtrx_msk = (1 << (rest_len % 8)) - 1;
>> +		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
>> +		if (i == 8) {
>> +			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
>> +				rewind_idx, perm_bytes);
>> +		} else {
>> +			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
>> +				permute_mask, permute_idx, tuple_bytes);
>> +
>> +			if (secondary_tuple)
>> +				perm_bytes =
>> +					_mm512_mask_permutexvar_epi8(
>> +					perm_bytes, permute_mask_2,
>> +					permute_idx, tuple_bytes_2);
>> +		}
>> +
>> +		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
>> +		xor_acc = _mm512_xor_si512(xor_acc, vals);
>> +	}
>> +
>> +	return xor_acc;
>> +}
>> +
>> +/**
>> + * Calculate Toeplitz hash.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param m
>> + *  Pointer to the matrices generated from the corresponding
>> + *  RSS hash key using rte_thash_complete_matrix().
>> + * @param tuple
>> + *  Pointer to the data to be hashed. Data must be in network byte order.
>> + * @param len
>> + *  Length of the data to be hashed.
>> + * @return
>> + *  Calculated Toeplitz hash value.
>> + */
>> +__rte_experimental
>> +static inline uint32_t
>> +rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
>> +{
>> +	uint32_t val, val_zero;
>> +
>> +	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
>> +	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
>> +
>> +	return val;
>> +}
>> +
>> +/**
>> + * Bulk implementation for Toeplitz hash.
>> + *
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param m
>> + *  Pointer to the matrices generated from the corresponding
>> + *  RSS hash key using rte_thash_complete_matrix().
>> + * @param tuple
>> + *  Array of the pointers on data to be hashed.
>> + *  Data must be in network byte order.
>> + * @param len
>> + *  Length of the largest data buffer to be hashed.
>> + * @param val
>> + *  Array of uint32_t where to put calculated Toeplitz hash values
>> + * @param num
>> + *  Number of tuples to hash.
>> + */
>> +__rte_experimental
>> +static inline void
>> +rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
>> +	uint32_t val[], uint32_t num)
>> +{
>> +	uint32_t i;
>> +	uint32_t val_zero;
>> +	__m512i xor_acc;
>> +
>> +	for (i = 0; i != (num & ~1); i += 2) {
>> +		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
>> +		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
>> +	}
>> +
>> +	if (num & 1) {
>> +		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
>> +		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
>> +	}
>> +}
>> +
>> +#endif /* _GFNI_ */
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_THASH_X86_GFNI_H_ */
>> diff --git a/lib/hash/version.map b/lib/hash/version.map
>> index ce4309a..cecf922 100644
>> --- a/lib/hash/version.map
>> +++ b/lib/hash/version.map
>> @@ -39,10 +39,12 @@ EXPERIMENTAL {
>>   	rte_hash_rcu_qsbr_add;
>>   	rte_thash_add_helper;
>>   	rte_thash_adjust_tuple;
>> +	rte_thash_complete_matrix;
>>   	rte_thash_find_existing;
>>   	rte_thash_free_ctx;
>>   	rte_thash_get_complement;
>>   	rte_thash_get_helper;
>>   	rte_thash_get_key;
>> +	rte_thash_gfni_supported;
>>   	rte_thash_init_ctx;
>>   };
>> --
>> 2.7.4
> 

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v4 0/5] optimized Toeplitz hash implementation
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
@ 2021-10-21 17:18     ` Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 " Vladimir Medvedkin
                         ` (5 more replies)
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
                       ` (4 subsequent siblings)
  5 siblings, 6 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 17:18 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch series adds a new optimized implementation of the Toeplitz hash
function using Galois Fields New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
piece of data, so there is no bulk implementation.
For performance reasons, the implementation was placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by checking the return value of rte_thash_gfni_supported() at runtime) before
calling these functions.

v4:
- included rte_log.h inside the rte_thash_gfni.h

v3:
- implementation moved to x86 specific header
- added rte_thash_gfni_supported() instead of the variable
- removed RTE_INIT section, due to adding rte_thash_gfni_supported()
- reworked rte_thash_complete_matrix() to make it easier to read

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (5):
  hash: add new toeplitz hash implementation
  hash: enable gfni thash implementation
  doc/hash: update documentation for the thash library
  test/thash: add tests for a new Toeplitz hash function
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 237 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 120 ++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  37 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   1 +
 lib/hash/rte_thash.c                        |  71 ++++++++-
 lib/hash/rte_thash.h                        |  54 +++++++
 lib/hash/rte_thash_gfni.h                   |  87 ++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 221 ++++++++++++++++++++++++++
 lib/hash/version.map                        |   3 +
 12 files changed, 830 insertions(+), 8 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v4 1/5] hash: add new toeplitz hash implementation
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
@ 2021-10-21 17:18     ` Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
                       ` (3 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 17:18 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds a new Toeplitz hash implementation using
Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/api/doxy-api-index.md     |   1 +
 lib/hash/meson.build          |   1 +
 lib/hash/rte_thash.c          |  29 ++++++
 lib/hash/rte_thash.h          |  35 +++++++
 lib/hash/rte_thash_gfni.h     |  87 +++++++++++++++++
 lib/hash/rte_thash_x86_gfni.h | 221 ++++++++++++++++++++++++++++++++++++++++++
 lib/hash/version.map          |   2 +
 7 files changed, 376 insertions(+)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 1992107..7549477 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -139,6 +139,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..40444ac 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,6 +7,7 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
 )
 indirect_headers += files('rte_crc_arm64.h')
 
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..e605a6f 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,35 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512))
+		return 1;
+#endif
+
+	return 0;
+};
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	int i, j;
+	uint8_t *m = (uint8_t *)matrixes;
+	uint8_t left_part, right_part;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < 8; j++) {
+			left_part = rss_key[i] << j;
+			right_part = (uint16_t)(rss_key[i + 1]) >> (8 - j);
+			m[i * 8 + j] = left_part|right_part;
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 76109fc..a406be0 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -28,6 +28,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -223,6 +224,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash are supported.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if GFNI is supported
+ *  0 otherwise
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts Toeplitz hash key (RSS key) into matrixes required
+ * for GFNI implementation
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..95718d1
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_log.h>
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Dummy implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Array of the pointers on data to be hashed.
+ *  Data must be in network byte order.
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
+	int len __rte_unused, uint8_t *tuple[] __rte_unused,
+	uint32_t val[], uint32_t num)
+{
+	unsigned int i;
+
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	for (i = 0; i < num; i++)
+		val[i] = 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..faa340a
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+/*
+ * Fold a 512-bit XOR accumulator down to 64 bits by repeatedly XOR-ing its
+ * upper half into its lower half (512 -> 256 -> 128 -> 64), then store the
+ * low 32 bits of the result in *val_1 and the high 32 bits in *val_2.
+ */
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	/* 512 -> 256 bits */
+	const __m256i lo_256 = _mm512_castsi512_si256(xor_acc);
+	const __m256i hi_256 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	const __m256i fold_256 = _mm256_xor_si256(lo_256, hi_256);
+
+	/* 256 -> 128 bits */
+	const __m128i lo_128 = _mm256_castsi256_si128(fold_256);
+	const __m128i hi_128 = _mm256_extracti32x4_epi32(fold_256, 1);
+	const __m128i fold_128 = _mm_xor_si128(lo_128, hi_128);
+
+	/* 128 -> 64 bits */
+	const uint64_t fold_64 = (uint64_t)_mm_extract_epi64(fold_128, 0) ^
+		(uint64_t)_mm_extract_epi64(fold_128, 1);
+
+	*val_1 = (uint32_t)fold_64;
+	*val_2 = (uint32_t)(fold_64 >> 32);
+}
+
+/*
+ * Core of the GFNI Toeplitz hash: run one or two tuples through the
+ * matrices at mtrx and return the not yet reduced 512-bit XOR accumulator.
+ *
+ * Inside every 64-bit lane the low four bytes are filled from tuple
+ * (permute masks 0x0f...) and, when secondary_tuple is not NULL, the high
+ * four bytes are filled from secondary_tuple (permute masks 0xf0...).
+ * After __rte_thash_xor_reduce() the low 32 bits therefore belong to tuple
+ * and the high 32 bits to secondary_tuple.
+ *
+ * len bytes are read from each tuple, 64 bytes per outer iteration, and
+ * eight matrices are consumed per inner iteration. A non-positive len
+ * returns an all-zero accumulator.
+ */
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	/*
+	 * Carries the last three bytes of a finished 64-byte chunk into the
+	 * first three lanes of the next pass. After eight inner iterations
+	 * lane 7 holds chunk bytes 60..63 twice: in bytes 56..59 for the
+	 * primary tuple and in bytes 60..63 for the secondary one, so each
+	 * half must be rewound from its own copy (57..59 and 61..63).
+	 */
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 63, 0, 0, 0, 59,
+						0, 0, 63, 62, 0, 0, 59, 58,
+						0, 63, 62, 61, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	/* Initialised so that len <= 0 never reads indeterminate values */
+	__m512i tuple_bytes = _mm512_setzero_si512();
+	__m512i tuple_bytes_2 = _mm512_setzero_si512();
+	__m512i vals, matrixes;
+	__mmask64 permute_mask = RTE_THASH_FIRST_ITER_MSK;
+	__mmask64 permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+	__mmask64 load_mask;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+
+	for (; len > 0; len -= 64, tuple += 64) {
+		/* A full previous chunk was consumed: carry its tail over */
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+
+		/* Keep the secondary tuple in step with the primary one */
+		if (secondary_tuple)
+			secondary_tuple += 64;
+	}
+
+	/* Process the lanes left over after the last group of eight */
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash of a single tuple using GFNI.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t hash, unused_hi;
+	/* No secondary tuple: only the low 32 bits of the result are used */
+	const __m512i acc = __rte_thash_gfni(m, tuple, NULL, len);
+
+	__rte_thash_xor_reduce(acc, &hash, &unused_hi);
+	return hash;
+}
+
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Tuples are hashed two at a time; when num is odd the last tuple is
+ * hashed on its own.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param tuple
+ *  Array of the pointers on data to be hashed.
+ *  Data must be in network byte order.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
+	uint32_t val[], uint32_t num)
+{
+	uint32_t i;
+	uint32_t val_zero;
+	__m512i xor_acc;
+
+	/* Hash tuples pairwise: tuple[i] and tuple[i + 1] in one pass */
+	for (i = 0; i != (num & ~1); i += 2) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
+		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
+	}
+
+	/* Odd count: the last tuple has no partner, discard the upper half */
+	if (num & 1) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
+		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
+	}
+}
+
+#endif /* __GFNI__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index ce4309a..cecf922 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -39,10 +39,12 @@ EXPERIMENTAL {
 	rte_hash_rcu_qsbr_add;
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
+	rte_thash_complete_matrix;
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
 	rte_thash_get_helper;
 	rte_thash_get_key;
+	rte_thash_gfni_supported;
 	rte_thash_init_ctx;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v4 2/5] hash: enable gfni thash implementation
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-21 17:18     ` Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
                       ` (2 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 17:18 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch enables new GFNI Toeplitz hash in
predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index e605a6f..242d0ff 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -87,6 +87,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -266,12 +268,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported()) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -385,6 +403,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -641,6 +663,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+/* Accessor for the GFNI matrices pointer stored in the thash context. */
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	const uint64_t *gfni_matrices = ctx->matrices;
+
+	return gfni_matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -752,11 +780,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a406be0..d12ab81 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -423,6 +423,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the Toeplitz hash matrices contained in the context.
+ * These matrices can be used with the fast Toeplitz hash implementation if
+ * the CPU supports GFNI.
+ * The matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the Toeplitz hash key matrices on success,
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index cecf922..3eda695 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -43,6 +43,7 @@ EXPERIMENTAL {
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
+	rte_thash_get_gfni_matrices;
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_gfni_supported;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v4 3/5] doc/hash: update documentation for the thash library
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
                       ` (2 preceding siblings ...)
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-21 17:18     ` Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 17:18 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds documentation for the new optimized Toeplitz hash
implementation using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
 doc/guides/rel_notes/release_21_11.rst      |  4 ++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..6f50a18 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
+* ``rte_thash_gfni_bulk()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above-mentioned softrss functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last two functions are vectorized implementations using
+Galois Fields New Instructions. They can be used only if ``rte_thash_gfni_supported()`` returns true.
+They expect the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_bulk()`` calculates the hash values for an array of tuples in one go.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_gfni_bulk()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A length of the longest tuple in bytes.
+* An array of pointers to the tuples to be hashed.
+* An array of ``uint32_t`` to write the results to.
+* The number of tuples to hash.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 4c56cdf..5b53117 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -159,6 +159,10 @@ New Features
   * Added tests to verify tunnel header verification in IPsec inbound.
   * Added tests to verify inner checksum.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v4 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
                       ` (3 preceding siblings ...)
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
@ 2021-10-21 17:18     ` Vladimir Medvedkin
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 17:18 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch provides a set of tests for verifying the new
implementation of Toeplitz hash function using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/test_thash.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..a625306 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = { /* 200 bytes: one 40-byte sequence repeated 5 times */
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,210 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	/*
+	 * Check rte_thash_gfni() against the reference L3 and L3/L4 hash
+	 * values stored in v4_tbl and v6_tbl. Skipped without GFNI support.
+	 */
+	uint64_t matrices[RTE_DIM(default_rss_key)];
+	union rte_thash_tuple tuple;
+	uint8_t *raw = (uint8_t *)&tuple;
+	uint32_t hash_l3, hash_l3l4;
+	uint32_t idx, k;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Derive the GFNI matrices from the default RSS key */
+	rte_thash_complete_matrix(matrices, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	/* IPv4 vectors */
+	for (idx = 0; idx < RTE_DIM(v4_tbl); idx++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[idx].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[idx].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[idx].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[idx].src_port);
+
+		hash_l3 = rte_thash_gfni(matrices, raw,
+				RTE_THASH_V4_L3_LEN * 4);
+		hash_l3l4 = rte_thash_gfni(matrices, raw,
+				RTE_THASH_V4_L4_LEN * 4);
+		if (hash_l3 != v4_tbl[idx].hash_l3)
+			return -TEST_FAILED;
+		if (hash_l3l4 != v4_tbl[idx].hash_l3l4)
+			return -TEST_FAILED;
+	}
+
+	/* IPv6 vectors */
+	for (idx = 0; idx < RTE_DIM(v6_tbl); idx++) {
+		for (k = 0; k < RTE_DIM(tuple.v6.src_addr); k++)
+			tuple.v6.src_addr[k] = v6_tbl[idx].src_ip[k];
+		for (k = 0; k < RTE_DIM(tuple.v6.dst_addr); k++)
+			tuple.v6.dst_addr[k] = v6_tbl[idx].dst_ip[k];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[idx].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[idx].src_port);
+
+		hash_l3 = rte_thash_gfni(matrices, raw,
+				RTE_THASH_V6_L3_LEN * 4);
+		hash_l3l4 = rte_thash_gfni(matrices, raw,
+				RTE_THASH_V6_L4_LEN * 4);
+		if (hash_l3 != v6_tbl[idx].hash_l3)
+			return -TEST_FAILED;
+		if (hash_l3l4 != v6_tbl[idx].hash_l3l4)
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+/* Slots of the hash[] array; the two buffers of each kind are adjacent */
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_BULK_DATA_BUF_1_HASH_IDX,
+	GFNI_BULK_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+/*
+ * Hash ITER pairs of random buffers with rte_softrss(), rte_thash_gfni()
+ * and rte_thash_gfni_bulk() and require all three to agree per buffer.
+ */
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t net_data[2][DATA_SZ];
+	uint32_t host_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t matrices[RTE_DIM(default_rss_key)];
+	uint8_t *bulk_data[2];
+	const int nbytes = DATA_SZ * sizeof(uint32_t);
+	int iter, word, buf;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(matrices, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (buf = 0; buf < 2; buf++)
+		bulk_data[buf] = (uint8_t *)net_data[buf];
+
+	for (iter = 0; iter < ITER; iter++) {
+		/* Fresh random input, plus a byte-swapped copy for softrss */
+		for (word = 0; word < DATA_SZ; word++) {
+			for (buf = 0; buf < 2; buf++) {
+				net_data[buf][word] = rte_rand();
+				host_data[buf][word] =
+					rte_cpu_to_be_32(net_data[buf][word]);
+			}
+		}
+
+		for (buf = 0; buf < 2; buf++) {
+			hash[SCALAR_DATA_BUF_1_HASH_IDX + buf] =
+				rte_softrss(host_data[buf], DATA_SZ,
+				default_rss_key);
+			hash[GFNI_DATA_BUF_1_HASH_IDX + buf] =
+				rte_thash_gfni(matrices,
+				(uint8_t *)net_data[buf], nbytes);
+		}
+		rte_thash_gfni_bulk(matrices, nbytes, bulk_data,
+			&hash[GFNI_BULK_DATA_BUF_1_HASH_IDX], 2);
+
+		for (buf = 0; buf < 2; buf++) {
+			const uint32_t ref =
+				hash[SCALAR_DATA_BUF_1_HASH_IDX + buf];
+
+			if (ref != hash[GFNI_DATA_BUF_1_HASH_IDX + buf])
+				return -TEST_FAILED;
+			if (ref != hash[GFNI_BULK_DATA_BUF_1_HASH_IDX + buf])
+				return -TEST_FAILED;
+		}
+	}
+
+	return TEST_SUCCESS;
+}
+
+/* Positions of the IPv4 and IPv6 tuples inside one bulk request */
+enum {
+	RSS_V4_IDX,
+	RSS_V6_IDX
+};
+
+/*
+ * Hash one IPv4 and one IPv6 tuple per call with rte_thash_gfni_bulk() and
+ * compare both results with the reference L3/L4 hashes from v4_tbl and
+ * v6_tbl. The tuple buffers are released on every exit path.
+ */
+static int
+test_toeplitz_hash_gfni_bulk(void)
+{
+	uint32_t i, j;
+	int ret = -TEST_FAILED;
+	union rte_thash_tuple tuple[2];
+	uint8_t *tuples[2] = { NULL, NULL };
+	uint32_t rss[2] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/* allocate enough memory for the biggest tuple */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			goto out;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/* Load IPv4 headers and copy them into the first tuple */
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/* Load IPv6 headers and copy them into the second tuple */
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		rte_thash_gfni_bulk(rss_key_matrixes, RTE_THASH_V6_L4_LEN * 4,
+			tuples, rss, 2);
+
+		if ((rss[RSS_V4_IDX] != v4_tbl[i].hash_l3l4) ||
+				(rss[RSS_V6_IDX] != v6_tbl[i].hash_l3l4))
+			goto out;
+	}
+
+	ret = TEST_SUCCESS;
+
+out:
+	/* rte_free() accepts NULL, so partially filled arrays are fine */
+	for (i = 0; i < RTE_DIM(tuples); i++)
+		rte_free(tuples[i]);
+
+	return ret;
+}
+
+/*
+ * Hash one random 64-byte tuple with the 200-byte big_rss_key and require
+ * rte_thash_gfni() to match rte_softrss().
+ */
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t net_words[16];
+	uint32_t host_words[16];
+	uint64_t matrices[RTE_DIM(big_rss_key)];
+	const unsigned int nbytes = RTE_DIM(net_words) * sizeof(uint32_t);
+	uint32_t scalar_hash, gfni_hash;
+	unsigned int w;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Derive the GFNI matrices from the long RSS key */
+	rte_thash_complete_matrix(matrices, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (w = 0; w < RTE_DIM(net_words); w++) {
+		net_words[w] = rte_rand();
+		host_words[w] = rte_be_to_cpu_32(net_words[w]);
+	}
+
+	scalar_hash = rte_softrss(host_words, RTE_DIM(net_words), big_rss_key);
+	gfni_hash = rte_thash_gfni(matrices, (uint8_t *)net_words, nbytes);
+
+	return (scalar_hash == gfni_hash) ? TEST_SUCCESS : -TEST_FAILED;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +810,10 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_bulk),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v4 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
                       ` (4 preceding siblings ...)
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
@ 2021-10-21 17:18     ` Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 17:18 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds performance tests for different implementations
of the Toeplitz hash function.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 120 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index f144d8b..b9c4e78 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -141,6 +141,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -315,6 +316,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+	'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..fb66e20
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define	BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static uint8_t default_rss_key[] = { /* 40-byte RSS key used by run_thash_test() */
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+/*
+ * Benchmark the Toeplitz hash implementations on BATCH_SZ random tuples of
+ * tuple_len bytes (rounded up to a multiple of 4) and print the average
+ * number of cycles per hash. The GFNI variants are measured only when
+ * rte_thash_gfni_supported() reports support. All tuple buffers are
+ * released before returning.
+ */
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ] = { NULL };
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	volatile uint32_t hash = 0;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		if (tuples[i] == NULL) {
+			printf("Cannot allocate tuple %u, test aborted\n", i);
+			goto out;
+		}
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	if (!rte_thash_gfni_supported())
+		goto out;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
+			bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_bulk takes \t%.1f cycles for key len %u\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+out:
+	/* rte_free() accepts NULL, so a partially filled array is fine */
+	for (i = 0; i < BATCH_SZ; i++)
+		rte_free(tuples[i]);
+}
+
+/* Run the benchmark once for each of the four predefined tuple lengths. */
+static int
+test_thash_perf(void)
+{
+	static const unsigned int tuple_lens[] = {
+		IPV4_2_TUPLE_LEN,
+		IPV4_4_TUPLE_LEN,
+		IPV6_2_TUPLE_LEN,
+		IPV6_4_TUPLE_LEN,
+	};
+	unsigned int n;
+
+	for (n = 0; n < RTE_DIM(tuple_lens); n++)
+		run_thash_test(tuple_lens[n]);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v5 0/5] optimized Toeplitz hash implementation
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
@ 2021-10-21 18:54       ` Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz " Vladimir Medvedkin
                         ` (4 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 18:54 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch series adds a new optimized implementation for the Toeplitz hash
function using Galois Fields New instruction (GFNI).
The main use case of this function is to calculate the hash value for a single
data buffer; a bulk variant, rte_thash_gfni_bulk(), is provided as well.
For performance reasons, the implementation was placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by doing a runtime check with rte_thash_gfni_supported()) before calling
these functions.

v5:
- rebase on the latest main
- fix spelling

v4:
- included rte_log.h inside the rte_thash_gfni.h

v3:
- implementation moved to x86 specific header
- added rte_thash_gfni_supported() instead of the variable
- removed RTE_INIT section, due to adding rte_thash_gfni_supported()
- reworked rte_thash_complete_matrix() to make it easier to read

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (5):
  hash: add new toeplitz hash implementation
  hash: enable gfni thash implementation
  doc/hash: update documentation for the thash library
  test/thash: add tests for a new Toeplitz hash function
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 237 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 120 ++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  37 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   1 +
 lib/hash/rte_thash.c                        |  71 ++++++++-
 lib/hash/rte_thash.h                        |  54 +++++++
 lib/hash/rte_thash_gfni.h                   |  87 ++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 221 ++++++++++++++++++++++++++
 lib/hash/version.map                        |   3 +
 12 files changed, 830 insertions(+), 8 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz hash implementation
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 " Vladimir Medvedkin
@ 2021-10-21 18:54       ` Vladimir Medvedkin
  2021-10-25 17:05         ` Thomas Monjalon
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
                         ` (3 subsequent siblings)
  5 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 18:54 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds a new Toeplitz hash implementation using
Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 doc/api/doxy-api-index.md     |   1 +
 lib/hash/meson.build          |   1 +
 lib/hash/rte_thash.c          |  29 ++++++
 lib/hash/rte_thash.h          |  35 +++++++
 lib/hash/rte_thash_gfni.h     |  87 +++++++++++++++++
 lib/hash/rte_thash_x86_gfni.h | 221 ++++++++++++++++++++++++++++++++++++++++++
 lib/hash/version.map          |   2 +
 7 files changed, 376 insertions(+)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 2939050..d9cecf3 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -140,6 +140,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..40444ac 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,6 +7,7 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
 )
 indirect_headers += files('rte_crc_arm64.h')
 
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..e605a6f 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,35 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+/*
+ * Report whether the GFNI Toeplitz hash implementation can be used.
+ *
+ * Returns 1 only when the GFNI code was compiled in
+ * (RTE_THASH_GFNI_DEFINED), the CPU reports the GFNI flag and the maximum
+ * allowed SIMD bitwidth is at least 512 bits. Returns 0 otherwise.
+ */
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512))
+		return 1;
+#endif
+
+	return 0;
+}
+
+/*
+ * Expand an RSS key into the byte matrices consumed by the GFNI hash.
+ *
+ * For every key byte i and every bit offset j in 0..7 this stores
+ * (rss_key[i] << j) | (next >> (8 - j)), truncated to 8 bits, into byte
+ * i * 8 + j of the output, where next is the key byte following i.
+ * For the last key byte, next wraps around to rss_key[0] so that the
+ * function never reads past rss_key[size - 1].
+ *
+ * matrixes must provide room for size * 8 bytes.
+ */
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	int i, j;
+	uint8_t *m = (uint8_t *)matrixes;
+	uint8_t left_part, right_part;
+
+	for (i = 0; i < size; i++) {
+		/* wrap around instead of reading rss_key[size] */
+		const uint8_t next = rss_key[(i + 1) % size];
+
+		for (j = 0; j < 8; j++) {
+			left_part = rss_key[i] << j;
+			right_part = (uint16_t)next >> (8 - j);
+			m[i * 8 + j] = left_part|right_part;
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 76109fc..a406be0 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -28,6 +28,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -223,6 +224,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash are supported.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if GFNI is supported
+ *  0 otherwise
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts Toeplitz hash key (RSS key) into matrixes required
+ * for GFNI implementation
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..95718d1
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_log.h>
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation, compiled when RTE_THASH_GFNI_DEFINED is not
+ * defined. It only logs an error and ignores all of its arguments.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param key
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Always 0 in this dummy implementation.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Dummy implementation, compiled when RTE_THASH_GFNI_DEFINED is not
+ * defined. It logs an error and zeroes the output array.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param tuple
+ *  Array of the pointers on data to be hashed.
+ *  Data must be in network byte order.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values.
+ *  The dummy implementation sets the first num entries to 0.
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
+	int len __rte_unused, uint8_t *tuple[] __rte_unused,
+	uint32_t val[], uint32_t num)
+{
+	unsigned int i;
+
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	for (i = 0; i < num; i++)
+		val[i] = 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..faa340a
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+/**
+ * @internal
+ * Fold a 512-bit XOR accumulator down to two 32-bit values.
+ *
+ * The vector is halved three times (512 -> 256 -> 128 -> 64 bits), each
+ * time XORing the upper half into the lower half.
+ *
+ * @param xor_acc
+ *  Accumulator to reduce.
+ * @param val_1
+ *  Receives the low 32 bits of the folded 64-bit value.
+ * @param val_2
+ *  Receives the high 32 bits of the folded 64-bit value.
+ */
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+
+	/* 512 -> 256 bits */
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+
+	/* 256 -> 128 bits */
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+
+	/* 128 -> 64 bits */
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+/**
+ * @internal
+ * Core of the GFNI Toeplitz hash, shared by rte_thash_gfni() and
+ * rte_thash_gfni_bulk().
+ *
+ * The tuple is consumed in chunks of up to 64 bytes (masked load). For each
+ * group of 8 matrices the chunk bytes are permuted into place, passed
+ * through the GF(2) affine instruction together with the matrices, and the
+ * result is XORed into a 512-bit accumulator. A final partial group of
+ * matrices, if any, is loaded under a mask covering only the remaining ones.
+ *
+ * @param mtrx
+ *  Matrices produced by rte_thash_complete_matrix().
+ * @param tuple
+ *  Data to be hashed.
+ * @param secondary_tuple
+ *  Optional second buffer hashed together with @p tuple, or NULL. Its bytes
+ *  are merged into the lanes selected by the *_MSK_2 permute masks.
+ * @param len
+ *  Number of bytes to hash; the same length is applied to both buffers.
+ * @return
+ *  Unreduced accumulator, to be folded by __rte_thash_xor_reduce().
+ */
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes;
+	/*
+	 * These four are assigned inside the chunk loop but read again in the
+	 * tail below. Give them defined values so that a call with len <= 0
+	 * (loop body never entered) does not read indeterminate data.
+	 */
+	__m512i tuple_bytes = _mm512_setzero_si512();
+	__m512i tuple_bytes_2 = _mm512_setzero_si512();
+	__mmask64 load_mask;
+	__mmask64 permute_mask = 0, permute_mask_2 = 0;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+
+	/*
+	 * NOTE(review): tuple advances by 64 bytes per chunk but
+	 * secondary_tuple does not, so for len > 64 every chunk appears to
+	 * reload the start of the secondary buffer - confirm whether the
+	 * two-tuple path is meant to support more than 64 bytes.
+	 */
+	for (; len > 0; len -= 64, tuple += 64) {
+		/* previous chunk consumed all 8 groups: carry over its tail */
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		/* load only the bytes that belong to the tuple */
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask);
+		/* one iteration per full group of 8 matrices */
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+
+	/* matrices left over after the last full group of 8 */
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash of a single tuple using GFNI.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t hash, unused_lane;
+	__m512i acc;
+
+	/* single tuple: no secondary buffer is hashed alongside */
+	acc = __rte_thash_gfni(m, tuple, NULL, len);
+	/* second reduced value belongs to the absent secondary tuple */
+	__rte_thash_xor_reduce(acc, &hash, &unused_lane);
+
+	return hash;
+}
+
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Tuples are hashed two at a time; an odd trailing tuple is hashed alone.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param tuple
+ *  Array of the pointers on data to be hashed.
+ *  Data must be in network byte order.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
+	uint32_t val[], uint32_t num)
+{
+	uint32_t i;
+	uint32_t val_zero;
+	__m512i xor_acc;
+
+	/* pairs: tuple[i] is the primary buffer, tuple[i + 1] the secondary */
+	for (i = 0; i != (num & ~1); i += 2) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
+		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
+	}
+
+	/* odd count: hash the last tuple alone, second result is discarded */
+	if (num & 1) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
+		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
+	}
+}
+
+#endif /* __GFNI__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 8185470..64bed86 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -39,10 +39,12 @@ EXPERIMENTAL {
 
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
+	rte_thash_complete_matrix;
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
 	rte_thash_get_helper;
 	rte_thash_get_key;
+	rte_thash_gfni_supported;
 	rte_thash_init_ctx;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v5 2/5] hash: enable gfni thash implementation
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 " Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-10-21 18:54       ` Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
                         ` (2 subsequent siblings)
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 18:54 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch enables new GFNI Toeplitz hash in
predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index e605a6f..242d0ff 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -87,6 +87,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -266,12 +268,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported()) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -385,6 +403,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -641,6 +663,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+/*
+ * Return the GFNI matrices pointer stored in the context.
+ * ctx is dereferenced without a NULL check.
+ */
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	return ctx->matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -752,11 +780,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a406be0..d12ab81 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -423,6 +423,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the toeplitz hash matrices contained in the context.
+ * These matrices could be used with fast toeplitz hash implementation if
+ * CPU supports GFNI.
+ * The matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the toeplitz hash key matrices on success
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 64bed86..23d3e76 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -43,6 +43,7 @@ EXPERIMENTAL {
 	rte_thash_find_existing;
 	rte_thash_free_ctx;
 	rte_thash_get_complement;
+	rte_thash_get_gfni_matrices;
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_gfni_supported;
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
                         ` (2 preceding siblings ...)
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-21 18:54       ` Vladimir Medvedkin
  2021-10-25 17:04         ` Thomas Monjalon
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  5 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 18:54 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds documentation for the new optimized Toeplitz hash
implementation using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
 doc/guides/rel_notes/release_21_11.rst      |  4 ++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..88b152e 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
+* ``rte_thash_gfni_bulk()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of above mentioned _softrss_ functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last two functions are vectorized implementations using
+Galois Fields New Instructions. They can be used only if ``rte_thash_gfni_supported()`` returns 1.
+They expect the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_bulk()`` calculates the hash values for an array of tuples in one call.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_gfni_bulk()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A length of the longest tuple in bytes.
+* An array of pointers to the tuples to be hashed.
+* An array of ``uint32_t`` to write the results to, and the number of tuples to hash.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index a0ad309..d6f39fb 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -240,6 +240,10 @@ New Features
   * Added tests to verify tunnel header verification in IPsec inbound.
   * Added tests to verify inner checksum.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v5 4/5] test/thash: add tests for a new Toeplitz hash function
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
                         ` (3 preceding siblings ...)
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
@ 2021-10-21 18:54       ` Vladimir Medvedkin
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  5 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 18:54 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch provides a set of tests for verifying the new
implementation of Toeplitz hash function using GFNI.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/test_thash.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..a625306 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,210 @@ test_toeplitz_hash_calc(void)
 }
 
+/*
+ * Check rte_thash_gfni() against the precomputed hashes stored in v4_tbl
+ * and v6_tbl, for both the L3-only and the L3+L4 tuple lengths.
+ * Skipped when rte_thash_gfni_supported() reports no GFNI support.
+ */
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple;
+	uint32_t rss_l3, rss_l3l4;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(v4_tbl); i++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		/*
+		 * NOTE(review): sport is filled from dst_port and dport from
+		 * src_port - presumably to compensate for the field order
+		 * inside the tuple union so that the bytes end up in network
+		 * order; confirm against the union definition.
+		 */
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if ((rss_l3 != v4_tbl[i].hash_l3) ||
+				(rss_l3l4 != v4_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_DIM(v6_tbl); i++) {
+		/* IPv6 addresses are copied element by element, unconverted */
+		for (j = 0; j < RTE_DIM(tuple.v6.src_addr); j++)
+			tuple.v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple.v6.dst_addr); j++)
+			tuple.v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if ((rss_l3 != v6_tbl[i].hash_l3) ||
+				(rss_l3l4 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_BULK_DATA_BUF_1_HASH_IDX,
+	GFNI_BULK_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+/*
+ * Cross-check the three implementations on random input: for ITER rounds,
+ * two random buffers of DATA_SZ 32-bit words are hashed with rte_softrss(),
+ * rte_thash_gfni() and rte_thash_gfni_bulk(), and all results for the same
+ * buffer must be equal.
+ * Skipped when rte_thash_gfni_supported() reports no GFNI support.
+ */
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+	uint8_t *bulk_data[2];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	/* the bulk call takes an array of byte pointers to the buffers */
+	for (i = 0; i < 2; i++)
+		bulk_data[i] = (uint8_t *)data[i];
+
+	for (i = 0; i < ITER; i++) {
+		/*
+		 * data[] is fed to the GFNI functions as is; scalar_data[]
+		 * holds the same words passed through rte_cpu_to_be_32() for
+		 * the scalar rte_softrss() reference.
+		 */
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]);
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+		/*
+		 * writes two results starting at the BULK_DATA_BUF_1 slot;
+		 * this relies on BULK_DATA_BUF_2 being the next enum value
+		 */
+		rte_thash_gfni_bulk(rss_key_matrixes,
+			DATA_SZ * sizeof(uint32_t), bulk_data,
+			&hash[GFNI_BULK_DATA_BUF_1_HASH_IDX], 2);
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+enum {
+	RSS_V4_IDX,
+	RSS_V6_IDX
+};
+
+/*
+ * Check rte_thash_gfni_bulk() on a pair of tuples of different kinds: an
+ * IPv4 L3+L4 tuple and an IPv6 L3+L4 tuple are hashed in one call and the
+ * results are compared with the precomputed values in v4_tbl and v6_tbl.
+ * Skipped when rte_thash_gfni_supported() reports no GFNI support.
+ *
+ * The two heap buffers are released on every path out of the function.
+ */
+static int
+test_toeplitz_hash_gfni_bulk(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	uint8_t *tuples[2] = { NULL, NULL };
+	uint32_t rss[2] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int ret = -TEST_FAILED;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/* allocate memory enough for a biggest tuple */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			goto out;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/* Load IPv4 headers and copy it into the corresponding tuple */
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/* Load IPv6 headers and copy it into the corresponding tuple */
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		/* len is that of the larger (IPv6) tuple */
+		rte_thash_gfni_bulk(rss_key_matrixes, RTE_THASH_V6_L4_LEN * 4,
+			tuples, rss, 2);
+
+		if ((rss[RSS_V4_IDX] != v4_tbl[i].hash_l3l4) ||
+				(rss[RSS_V6_IDX] != v6_tbl[i].hash_l3l4))
+			goto out;
+	}
+
+	ret = TEST_SUCCESS;
+
+out:
+	/* rte_free() accepts NULL, so a partial allocation is handled too */
+	for (i = 0; i < RTE_DIM(tuples); i++)
+		rte_free(tuples[i]);
+
+	return ret;
+}
+
+/*
+ * Compare rte_thash_gfni() with the scalar rte_softrss() on one random
+ * tuple of 16 32-bit words hashed with big_rss_key.
+ * Skipped when rte_thash_gfni_supported() reports no GFNI support.
+ */
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t net_words[16];
+	uint32_t host_words[16];
+	uint64_t key_matrices[RTE_DIM(big_rss_key)];
+	const unsigned int nb_bytes = RTE_DIM(net_words) * sizeof(uint32_t);
+	uint32_t scalar_hash, gfni_hash;
+	unsigned int idx;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Derive the GFNI matrices from the RSS key */
+	rte_thash_complete_matrix(key_matrices, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	/* same random words, byte-swapped copy for the scalar reference */
+	for (idx = 0; idx < RTE_DIM(net_words); idx++) {
+		net_words[idx] = rte_rand();
+		host_words[idx] = rte_be_to_cpu_32(net_words[idx]);
+	}
+
+	scalar_hash = rte_softrss(host_words, RTE_DIM(net_words),
+		big_rss_key);
+	gfni_hash = rte_thash_gfni(key_matrices, (uint8_t *)net_words,
+		nb_bytes);
+
+	return (scalar_hash == gfni_hash) ? TEST_SUCCESS : -TEST_FAILED;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +810,10 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_bulk),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
                         ` (4 preceding siblings ...)
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
@ 2021-10-21 18:54       ` Vladimir Medvedkin
  2021-10-25 17:02         ` Thomas Monjalon
  2021-10-25 17:27         ` Stephen Hemminger
  5 siblings, 2 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-21 18:54 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

This patch adds performance tests for different implementations
of the Toeplitz hash function.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 120 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index ba2600a..8b9e6e9 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -144,6 +144,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -320,6 +321,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+	'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..fb66e20
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define	BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ];
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	volatile uint32_t hash = 0;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	if (!rte_thash_gfni_supported())
+		return;
+
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
+			bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+}
+
+static int
+test_thash_perf(void)
+{
+	run_thash_test(IPV4_2_TUPLE_LEN);
+	run_thash_test(IPV4_4_TUPLE_LEN);
+	run_thash_test(IPV6_2_TUPLE_LEN);
+	run_thash_test(IPV6_4_TUPLE_LEN);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
@ 2021-10-25 17:02         ` Thomas Monjalon
  2021-10-26 20:29           ` Medvedkin, Vladimir
  2021-10-25 17:27         ` Stephen Hemminger
  1 sibling, 1 reply; 72+ messages in thread
From: Thomas Monjalon @ 2021-10-25 17:02 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

21/10/2021 20:54, Vladimir Medvedkin:
> This patch adds performance tests for different implementations
> of the Toeplitz hash function.

Please name them.

> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>

There is some garbage,

> @@ -320,6 +321,7 @@ perf_test_names = [
>          'hash_readwrite_lf_perf_autotest',
>          'trace_perf_autotest',
>          'ipsec_perf_autotest',
> +	'thash_perf_autotest',

here (tabs instead of spaces)

>  driver_test_names = [
> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
> new file mode 100644
> index 0000000..fb66e20
> --- /dev/null
> +++ b/app/test/test_thash_perf.c
> @@ -0,0 +1,120 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2021 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <math.h>
> +
> +#include <rte_cycles.h>
> +#include <rte_malloc.h>
> +#include <rte_random.h>
> +#include <rte_thash.h>
> +
> +#include "test.h"
> +
> +#define ITERATIONS	(1 << 15)
> +#define	BATCH_SZ	(1 << 10)
> +
> +#define IPV4_2_TUPLE_LEN	(8)
> +#define IPV4_4_TUPLE_LEN	(12)
> +#define IPV6_2_TUPLE_LEN	(32)
> +#define IPV6_4_TUPLE_LEN	(36)
> +
> +
> +static uint8_t default_rss_key[] = {
> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> +};
> +
> +static void
> +run_thash_test(unsigned int tuple_len)
> +{
> +	uint32_t *tuples[BATCH_SZ];
> +	unsigned int i, j;
> +	uint64_t start_tsc, end_tsc;
> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
> +	volatile uint32_t hash = 0;
> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
> +
> +	for (i = 0; i < BATCH_SZ; i++) {
> +		tuples[i] = rte_zmalloc(NULL, len, 0);
> +		for (j = 0; j < len / sizeof(uint32_t); j++)
> +			tuples[i][j] = rte_rand();
> +	}
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++) {
> +		for (j = 0; j < BATCH_SZ; j++) {
> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
> +				default_rss_key);
> +		}
> +	}
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++) {
> +		for (j = 0; j < BATCH_SZ; j++) {
> +			hash ^= rte_softrss_be(tuples[j], len /
> +				sizeof(uint32_t), default_rss_key);
> +		}
> +	}
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);

The function could stop here (one function per type of implementation).

> +
> +	if (!rte_thash_gfni_supported())
> +		return;
> +
> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
> +
> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
> +		RTE_DIM(default_rss_key));
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++) {
> +		for (j = 0; j < BATCH_SZ; j++)
> +			hash ^= rte_thash_gfni(rss_key_matrixes,
> +				(uint8_t *)tuples[j], len);
> +	}
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);
> +
> +	start_tsc = rte_rdtsc_precise();
> +	for (i = 0; i < ITERATIONS; i++)
> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
> +			bulk_hash, BATCH_SZ);
> +
> +	end_tsc = rte_rdtsc_precise();
> +
> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",

and here, the function name is not updated.

> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> +		BATCH_SZ), len);
> +

useless blank line

> +}




^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
@ 2021-10-25 17:04         ` Thomas Monjalon
  2021-10-26 20:30           ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Thomas Monjalon @ 2021-10-25 17:04 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

Vladimir, your patches are late and not perfect.
You need reviews. Please ask other maintainers to help with reviews.


21/10/2021 20:54, Vladimir Medvedkin:
> This patch adds documentation for the new optimized Toeplitz hash
> implementation using GFNI.
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
>  doc/guides/rel_notes/release_21_11.rst      |  4 ++++
>  2 files changed, 37 insertions(+), 4 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
> index f916857..88b152e 100644
> --- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
> +++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
> @@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
>  Toeplitz hash function API
>  --------------------------
>  
> -There are two functions that provide calculation of the Toeplitz hash sum:
> +There are four functions that provide calculation of the Toeplitz hash sum:
>  
>  * ``rte_softrss()``
>  * ``rte_softrss_be()``
> +* ``rte_thash_gfni()``
> +* ``rte_thash_gfni_x2()``

The last function doesn't exist. I think it should be the _bulk one.

Also please squash the doc and test with the relevant code addition.
Maybe 2 patches for each implementation?



^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz hash implementation
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-10-25 17:05         ` Thomas Monjalon
  0 siblings, 0 replies; 72+ messages in thread
From: Thomas Monjalon @ 2021-10-25 17:05 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

21/10/2021 20:54, Vladimir Medvedkin:
> This patch adds a new Toeplitz hash implementation using
> Galois Field New Instructions (GFNI).
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
> --- a/lib/hash/version.map
> +++ b/lib/hash/version.map
> @@ -39,10 +39,12 @@ EXPERIMENTAL {
>  
>  	rte_thash_add_helper;
>  	rte_thash_adjust_tuple;
> +	rte_thash_complete_matrix;
>  	rte_thash_find_existing;
>  	rte_thash_free_ctx;
>  	rte_thash_get_complement;
>  	rte_thash_get_helper;
>  	rte_thash_get_key;
> +	rte_thash_gfni_supported;
>  	rte_thash_init_ctx;
>  };
> 

It should be like this:

--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -37,6 +37,7 @@ DPDK_22 {
 EXPERIMENTAL {
        global:
 
+       # added in 21.05
        rte_thash_add_helper;
        rte_thash_adjust_tuple;
        rte_thash_find_existing;
@@ -45,4 +46,8 @@ EXPERIMENTAL {
        rte_thash_get_helper;
        rte_thash_get_key;
        rte_thash_init_ctx;
+
+       # added in 21.11
+       rte_thash_complete_matrix;
+       rte_thash_gfni_supported;
 };





^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  2021-10-25 17:02         ` Thomas Monjalon
@ 2021-10-25 17:27         ` Stephen Hemminger
  2021-10-26 20:31           ` Medvedkin, Vladimir
  1 sibling, 1 reply; 72+ messages in thread
From: Stephen Hemminger @ 2021-10-25 17:27 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson, konstantin.ananyev

On Thu, 21 Oct 2021 19:54:29 +0100
Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:

> +static uint8_t default_rss_key[] = {

Should this be const?

That way you can make sure API isn't modifying it.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-25 17:02         ` Thomas Monjalon
@ 2021-10-26 20:29           ` Medvedkin, Vladimir
  2021-10-27  8:29             ` Thomas Monjalon
  0 siblings, 1 reply; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-26 20:29 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

Hi Thomas,

Thanks for the review, I'll address your comments in v6.
Please find my comment below

On 25/10/2021 19:02, Thomas Monjalon wrote:
> 21/10/2021 20:54, Vladimir Medvedkin:
>> This patch adds performance tests for different implementations
>> of the Toeplitz hash function.
> 
> Please name them.
> 
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> 
> There are some garbage,
> 
>> @@ -320,6 +321,7 @@ perf_test_names = [
>>           'hash_readwrite_lf_perf_autotest',
>>           'trace_perf_autotest',
>>           'ipsec_perf_autotest',
>> +	'thash_perf_autotest',
> 
> here (tabs instead of space)
> 
>>   driver_test_names = [
>> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
>> new file mode 100644
>> index 0000000..fb66e20
>> --- /dev/null
>> +++ b/app/test/test_thash_perf.c
>> @@ -0,0 +1,120 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2021 Intel Corporation
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdint.h>
>> +#include <stdlib.h>
>> +#include <math.h>
>> +
>> +#include <rte_cycles.h>
>> +#include <rte_malloc.h>
>> +#include <rte_random.h>
>> +#include <rte_thash.h>
>> +
>> +#include "test.h"
>> +
>> +#define ITERATIONS	(1 << 15)
>> +#define	BATCH_SZ	(1 << 10)
>> +
>> +#define IPV4_2_TUPLE_LEN	(8)
>> +#define IPV4_4_TUPLE_LEN	(12)
>> +#define IPV6_2_TUPLE_LEN	(32)
>> +#define IPV6_4_TUPLE_LEN	(36)
>> +
>> +
>> +static uint8_t default_rss_key[] = {
>> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>> +};
>> +
>> +static void
>> +run_thash_test(unsigned int tuple_len)
>> +{
>> +	uint32_t *tuples[BATCH_SZ];
>> +	unsigned int i, j;
>> +	uint64_t start_tsc, end_tsc;
>> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
>> +	volatile uint32_t hash = 0;
>> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
>> +
>> +	for (i = 0; i < BATCH_SZ; i++) {
>> +		tuples[i] = rte_zmalloc(NULL, len, 0);
>> +		for (j = 0; j < len / sizeof(uint32_t); j++)
>> +			tuples[i][j] = rte_rand();
>> +	}
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		for (j = 0; j < BATCH_SZ; j++) {
>> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
>> +				default_rss_key);
>> +		}
>> +	}
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		for (j = 0; j < BATCH_SZ; j++) {
>> +			hash ^= rte_softrss_be(tuples[j], len /
>> +				sizeof(uint32_t), default_rss_key);
>> +		}
>> +	}
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
> 
> The function could stop here (one function per type of implementation).
> 

Could you please clarify what you mean?
The function stops here if the machine does not support GFNI, and this is
done intentionally. On a machine without GFNI it tests only the scalar
implementations for every given length.

>> +
>> +	if (!rte_thash_gfni_supported())
>> +		return;
>> +
>> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
>> +
>> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
>> +		RTE_DIM(default_rss_key));
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		for (j = 0; j < BATCH_SZ; j++)
>> +			hash ^= rte_thash_gfni(rss_key_matrixes,
>> +				(uint8_t *)tuples[j], len);
>> +	}
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
>> +
>> +	start_tsc = rte_rdtsc_precise();
>> +	for (i = 0; i < ITERATIONS; i++)
>> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
>> +			bulk_hash, BATCH_SZ);
>> +
>> +	end_tsc = rte_rdtsc_precise();
>> +
>> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
> 
> and here, the function name is not updated.
> 
>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>> +		BATCH_SZ), len);
>> +
> 
> useless blank line
> 
>> +}
> 
> 
> 

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library
  2021-10-25 17:04         ` Thomas Monjalon
@ 2021-10-26 20:30           ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-26 20:30 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen



On 25/10/2021 19:04, Thomas Monjalon wrote:
> Vladimir, your patches are late and not perfect.
> You need reviews. Please ask other maintainers to help with reviews.
> 
> 
> 21/10/2021 20:54, Vladimir Medvedkin:
>> This patch adds documentation for the new optimized Toeplitz hash
>> implementation using GFNI.
>>
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>> ---
>>   doc/guides/prog_guide/toeplitz_hash_lib.rst | 37 +++++++++++++++++++++++++----
>>   doc/guides/rel_notes/release_21_11.rst      |  4 ++++
>>   2 files changed, 37 insertions(+), 4 deletions(-)
>>
>> diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
>> index f916857..88b152e 100644
>> --- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
>> +++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
>> @@ -19,24 +19,53 @@ to calculate the RSS hash sum to spread the traffic among the queues.
>>   Toeplitz hash function API
>>   --------------------------
>>   
>> -There are two functions that provide calculation of the Toeplitz hash sum:
>> +There are four functions that provide calculation of the Toeplitz hash sum:
>>   
>>   * ``rte_softrss()``
>>   * ``rte_softrss_be()``
>> +* ``rte_thash_gfni()``
>> +* ``rte_thash_gfni_x2()``
> 
> The last function doesn't exist. I think it should be the _bulk one.
> 
> Also please squash the doc and test with the relevant code addition.
> Maybe 2 patches for each implementation?
> 

Good, I'll send v6

> 

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-25 17:27         ` Stephen Hemminger
@ 2021-10-26 20:31           ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-26 20:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson, konstantin.ananyev

Hi Stephen,

On 25/10/2021 19:27, Stephen Hemminger wrote:
> On Thu, 21 Oct 2021 19:54:29 +0100
> Vladimir Medvedkin <vladimir.medvedkin@intel.com> wrote:
> 
>> +static uint8_t default_rss_key[] = {
> 
> Should this be const?
> 
> That way you can make sure API isn't modifying it.
> 

Thanks, I'll fix this in v6


-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] optimized Toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (10 preceding siblings ...)
  2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
@ 2021-10-26 20:32 ` Vladimir Medvedkin
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 1/4] hash: add new toeplitz " Vladimir Medvedkin
                   ` (13 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-26 20:32 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch series adds a new optimized implementation of the Toeplitz hash
function using Galois Field New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
tuple; a bulk variant is added by a separate patch in this series.
For performance reasons, the implementation was placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by calling rte_thash_gfni_supported() at run time) before calling
these functions.

v6:
- addressed Thomas and Stephen's comments
- squashed the doc and test commits with the relevant code addition
- split into 2 patches, one per implementation - single and bulk

v5:
- rebase on the latest main
- fix spelling

v4:
- included rte_log.h inside the rte_thash_gfni.h

v3:
- implementation moved to x86 specific header
- added rte_thash_gfni_supported() instead of the variable
- removed RTE_INIT section, due to adding rte_thash_gfni_supported()
- reworked rte_thash_complete_matrix() to make it easier to read

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (4):
  hash: add new toeplitz hash implementation
  hash: add bulk toeplitz hash implementation
  hash: enable gfni thash implementation
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 237 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 119 ++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  38 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  71 ++++++++-
 lib/hash/rte_thash.h                        |  54 +++++++
 lib/hash/rte_thash_gfni.h                   |  87 ++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 221 ++++++++++++++++++++++++++
 lib/hash/version.map                        |   6 +
 12 files changed, 837 insertions(+), 9 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] hash: add new toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (11 preceding siblings ...)
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-26 20:32 ` Vladimir Medvedkin
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 2/4] hash: add bulk " Vladimir Medvedkin
                   ` (12 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-26 20:32 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds a new Toeplitz hash implementation using
Galois Field New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 172 ++++++++++++++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  28 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  29 +++++
 lib/hash/rte_thash.h                        |  35 ++++++
 lib/hash/rte_thash_gfni.h                   |  54 +++++++++
 lib/hash/rte_thash_x86_gfni.h               | 182 ++++++++++++++++++++++++++++
 lib/hash/version.map                        |   5 +
 10 files changed, 511 insertions(+), 5 deletions(-)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..22d784e 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,146 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple;
+	uint32_t rss_l3, rss_l3l4;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(v4_tbl); i++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if ((rss_l3 != v4_tbl[i].hash_l3) ||
+				(rss_l3l4 != v4_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_DIM(v6_tbl); i++) {
+		for (j = 0; j < RTE_DIM(tuple.v6.src_addr); j++)
+			tuple.v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple.v6.dst_addr); j++)
+			tuple.v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if ((rss_l3 != v6_tbl[i].hash_l3) ||
+				(rss_l3l4 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < ITER; i++) {
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]);
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+enum {
+	RSS_V4_IDX,
+	RSS_V6_IDX
+};
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t arr[16];
+	uint32_t arr_softrss[16];
+	uint32_t hash_1, hash_2;
+	uint64_t rss_key_matrixes[RTE_DIM(big_rss_key)];
+	unsigned int i, size = RTE_DIM(arr) * sizeof(uint32_t);
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (i = 0; i < RTE_DIM(arr); i++) {
+		arr[i] = rte_rand();
+		arr_softrss[i] = rte_be_to_cpu_32(arr[i]);
+	}
+
+	hash_1 = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key);
+	hash_2 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)arr, size);
+
+	if (hash_1 != hash_2)
+		return -TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +746,9 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 49892a3..4245b96 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -142,6 +142,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..acdd8c3 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,44 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are three functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
 
-Both of these functions take the parameters:
+First two functions are scalar implementation and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above-mentioned softrss functions expect the tuple to be in
+"host" byte order and to be a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last function is vectorized implementation using
+Galois Fields New Instructions. Could be used if ``rte_thash_gfni_supported`` returns true.
+It expects the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 1ccac87..4daeb4a 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -305,6 +305,10 @@ New Features
     * Pcapng format with timestamps and meta-data.
     * Fixes packet capture with stripped VLAN tags.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..12b1afc 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,8 +7,12 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
+)
+indirect_headers += files(
+        'rte_crc_arm64.h',
+        'rte_thash_x86_gfni.h',
 )
-indirect_headers += files('rte_crc_arm64.h')
 
 sources = files('rte_cuckoo_hash.c', 'rte_fbk_hash.c', 'rte_thash.c')
 deps += ['net']
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..e605a6f 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,35 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	/* needs the GFNI CPU flag and a max SIMD bitwidth of at least 512 */
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+		return 1;
+#endif
+
+	return 0;
+}
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	uint8_t *m = (uint8_t *)matrixes;
+	int i, j;
+
+	for (i = 0; i < size; i++) {
+		/* wrap to byte 0 after the last byte; rss_key[size] is out of bounds */
+		const uint16_t next = rss_key[(i + 1) % size];
+
+		/* row j: the 8 key bits starting j bits into byte i */
+		for (j = 0; j < 8; j++)
+			m[i * 8 + j] = (uint8_t)((rss_key[i] << j) | (next >> (8 - j)));
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a26fe56..40146cf 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -24,6 +24,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -219,6 +220,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash can be used.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if the CPU has GFNI and the max SIMD bitwidth is at least 512
+ *  0 otherwise (always 0 when built without RTE_THASH_GFNI_DEFINED)
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts Toeplitz hash key (RSS key) into the matrices required
+ * by the GFNI implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  Pointer to the memory where the matrices will be written.
+ *  Note: this memory must hold size * 8 bytes, i.e. size uint64_t values.
+ * @param rss_key
+ *  Pointer to the Toeplitz hash key.
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..bbacd41
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_log.h>
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation used when RTE_THASH_GFNI_DEFINED is not set.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix(). Unused here.
+ * @param key
+ *  Pointer to the data to be hashed (the tuple). Unused here.
+ * @param len
+ *  Length of the data to be hashed. Unused here.
+ * @return
+ *  Always 0; an error is logged on every call.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..1cb7353
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+	/* XOR the upper 256 bits of the accumulator into the lower 256 bits */
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+	/* XOR the upper 128 bits of that result into the lower 128 bits */
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+	/* XOR the two remaining 64-bit lanes together */
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+	/* low 32 bits go to val_1, high 32 bits go to val_2 */
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+	/* rewind_idx is applied to perm_bytes itself (not tuple bytes) when i == 8 */
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+	/* NOTE(review): if len <= 0 the loop is skipped and the tail reads unset locals */
+	for (; len > 0; len -= 64, tuple += 64) {
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+		/* NOTE(review): secondary_tuple never advances; confirm for len > 64 */
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+	/* tail: fewer than 8 matrices are left to apply for the last chunk */
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed, in bytes.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
+	/* no secondary tuple was passed, so val_zero is discarded */
+	return val;
+}
+
+#endif /* _GFNI_ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 8185470..153ab87 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -37,6 +37,7 @@ DPDK_22 {
 EXPERIMENTAL {
 	global:
 
+	#added in 21.05
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
 	rte_thash_find_existing;
@@ -45,4 +46,8 @@ EXPERIMENTAL {
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_init_ctx;
+
+	#added in 21.11
+	rte_thash_complete_matrix;
+	rte_thash_gfni_supported;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] hash: add bulk toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (12 preceding siblings ...)
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 1/4] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-10-26 20:32 ` Vladimir Medvedkin
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
                   ` (11 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-26 20:32 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds a bulk version for the Toeplitz hash implemented
with Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 67 ++++++++++++++++++++++++++++-
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 20 ++++++---
 lib/hash/rte_thash_gfni.h                   | 33 ++++++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 39 +++++++++++++++++
 4 files changed, 153 insertions(+), 6 deletions(-)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index 22d784e..a625306 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -230,6 +230,8 @@ enum {
 	SCALAR_DATA_BUF_2_HASH_IDX,
 	GFNI_DATA_BUF_1_HASH_IDX,
 	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_BULK_DATA_BUF_1_HASH_IDX,
+	GFNI_BULK_DATA_BUF_2_HASH_IDX,
 	HASH_IDXES
 };
 
@@ -241,6 +243,7 @@ test_toeplitz_hash_rand_data(void)
 	uint32_t hash[HASH_IDXES] = { 0 };
 	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
 	int i, j;
+	uint8_t *bulk_data[2];
 
 	if (!rte_thash_gfni_supported())
 		return TEST_SKIPPED;
@@ -248,6 +251,9 @@ test_toeplitz_hash_rand_data(void)
 	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
 		RTE_DIM(default_rss_key));
 
+	for (i = 0; i < 2; i++)
+		bulk_data[i] = (uint8_t *)data[i];
+
 	for (i = 0; i < ITER; i++) {
 		for (j = 0; j < DATA_SZ; j++) {
 			data[0][j] = rte_rand();
@@ -266,11 +272,18 @@ test_toeplitz_hash_rand_data(void)
 		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
 			rss_key_matrixes, (uint8_t *)data[1],
 			DATA_SZ * sizeof(uint32_t));
+		rte_thash_gfni_bulk(rss_key_matrixes,
+			DATA_SZ * sizeof(uint32_t), bulk_data,
+			&hash[GFNI_BULK_DATA_BUF_1_HASH_IDX], 2);
 
 		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
 				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_1_HASH_IDX]) ||
 				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
-				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_2_HASH_IDX]))
 
 			return -TEST_FAILED;
 	}
@@ -284,6 +297,57 @@ enum {
 };
 
 static int
+test_toeplitz_hash_gfni_bulk(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	/* Zeroed stack buffers, each big enough for the biggest tuple;
+	 * nothing has to be freed on any return path.
+	 */
+	uint8_t tuple_buf[2][RTE_THASH_V6_L4_LEN * 4] = { { 0 } };
+	uint8_t *tuples[2] = { tuple_buf[0], tuple_buf[1] };
+	uint32_t rss[2] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/*Load IPv4 headers and copy it into the corresponding tuple*/
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/*Load IPv6 headers and copy it into the corresponding tuple*/
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		/* One length for the whole bulk: the longest (IPv6) tuple;
+		 * the IPv4 buffer is zero-padded up to that length.
+		 */
+		rte_thash_gfni_bulk(rss_key_matrixes, RTE_THASH_V6_L4_LEN * 4,
+			tuples, rss, 2);
+
+		if ((rss[RSS_V4_IDX] != v4_tbl[i].hash_l3l4) ||
+				(rss[RSS_V6_IDX] != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_big_tuple_gfni(void)
 {
 	uint32_t arr[16];
@@ -748,6 +812,7 @@ static struct unit_test_suite thash_tests = {
 	TEST_CASE(test_toeplitz_hash_calc),
 	TEST_CASE(test_toeplitz_hash_gfni),
 	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_bulk),
 	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index acdd8c3..61eaafd 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,11 +19,12 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are three functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
 * ``rte_thash_gfni()``
+* ``rte_thash_gfni_bulk()``
 
 First two functions are scalar implementation and take the parameters:
 
@@ -38,11 +39,12 @@ to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
-The last function is vectorized implementation using
-Galois Fields New Instructions. Could be used if ``rte_thash_gfni_supported`` returns true.
-It expects the tuple to be in network byte order.
+The last two functions are vectorized implementations using
+Galois Fields New Instructions. They can be used if ``rte_thash_gfni_supported()`` returns true.
+They expect the tuple to be in network byte order.
 
-``rte_thash_gfni()`` calculates the hash value for a single tuple
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_bulk()`` is a bulk implementation of ``rte_thash_gfni()``.
 
 ``rte_thash_gfni()`` takes the parameters:
 
@@ -50,6 +52,14 @@ It expects the tuple to be in network byte order.
 * A pointer to the tuple.
 * A length of the tuple in bytes.
 
+``rte_thash_gfni_bulk()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A length of the longest tuple in bytes.
+* An array of pointers to the data to be hashed.
+* An array of ``uint32_t`` where the calculated Toeplitz hash values are put.
+* Number of tuples in a bulk.
+
 ``rte_thash_complete_matrix()`` is a function that calculates matrices required by
 GFNI implementations from the RSS hash key. It takes the parameters:
 
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
index bbacd41..e97d912 100644
--- a/lib/hash/rte_thash_gfni.h
+++ b/lib/hash/rte_thash_gfni.h
@@ -45,6 +45,39 @@ rte_thash_gfni(const uint64_t *mtrx __rte_unused,
 	return 0;
 }
 
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Dummy implementation used when RTE_THASH_GFNI_DEFINED is not set.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix(). Unused here.
+ * @param len
+ *  Length of the largest data buffer to be hashed. Unused here.
+ * @param tuple
+ *  Array of pointers to the data to be hashed.
+ *  Data must be in network byte order. Unused here.
+ * @param val
+ *  Array of uint32_t; the first num entries are set to 0.
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
+	int len __rte_unused, uint8_t *tuple[] __rte_unused,
+	uint32_t val[], uint32_t num)
+{
+	unsigned int i;
+
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	for (i = 0; i < num; i++)
+		val[i] = 0;
+}
+
 #endif /* RTE_THASH_GFNI_DEFINED */
 
 #ifdef __cplusplus
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
index 1cb7353..7bfb937 100644
--- a/lib/hash/rte_thash_x86_gfni.h
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -173,6 +173,45 @@ rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
 	return val;
 }
 
+/**
+ * Bulk implementation for Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param len
+ *  Length of the largest data buffer to be hashed; it is used for all tuples.
+ * @param tuple
+ *  Array of pointers to the data to be hashed.
+ *  Data must be in network byte order.
+ * @param val
+ *  Array of num uint32_t where the calculated Toeplitz hash values are put.
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
+	uint32_t val[], uint32_t num)
+{
+	uint32_t i;
+	uint32_t val_zero;
+	__m512i xor_acc;
+	/* hash the tuples two at a time; one call yields both results */
+	for (i = 0; i != (num & ~1); i += 2) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
+		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
+	}
+	/* odd count: hash the last tuple alone and drop the second result */
+	if (num & 1) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
+		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
+	}
+}
+
 #endif /* _GFNI_ */
 
 #ifdef __cplusplus
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] hash: enable gfni thash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (13 preceding siblings ...)
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 2/4] hash: add bulk " Vladimir Medvedkin
@ 2021-10-26 20:32 ` Vladimir Medvedkin
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
                   ` (10 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-26 20:32 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch enables the new GFNI Toeplitz hash in
the predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index e605a6f..242d0ff 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -87,6 +87,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -266,12 +268,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported()) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -385,6 +403,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -641,6 +663,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	return ctx->matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -752,11 +780,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 40146cf..c11ca0d 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -419,6 +419,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the Toeplitz hash matrices contained in the context.
+ * These matrices can be used with the fast Toeplitz hash implementation if
+ * the CPU supports GFNI.
+ * The matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the Toeplitz hash key matrices on success
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 153ab87..705c3f3 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -49,5 +49,6 @@ EXPERIMENTAL {
 
 	#added in 21.11
 	rte_thash_complete_matrix;
+	rte_thash_get_gfni_matrices;
 	rte_thash_gfni_supported;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] test/thash: add performance tests for the Toeplitz hash
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (14 preceding siblings ...)
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-26 20:32 ` Vladimir Medvedkin
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (9 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-26 20:32 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds performance tests for the following Toeplitz hash
function implementations:
  Scalar:
    - rte_softrss()
    - rte_softrss_be()
  Vector using gfni:
    - rte_thash_gfni()
    - rte_thash_gfni_bulk()

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 119 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 20f36a1..913e8f6 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -144,6 +144,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -322,6 +323,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+        'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..5454b69
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define	BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static const uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ];
+	unsigned int i, j;
+	uint64_t start_tsc, end_tsc;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	volatile uint32_t hash = 0;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
+				default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			hash ^= rte_softrss_be(tuples[j], len /
+				sizeof(uint32_t), default_rss_key);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	if (!rte_thash_gfni_supported())
+		return;
+
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++)
+			hash ^= rte_thash_gfni(rss_key_matrixes,
+				(uint8_t *)tuples[j], len);
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
+			bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	printf("Average rte_thash_gfni_bulk takes \t%.1f cycles for key len %d\n",
+		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
+		BATCH_SZ), len);
+}
+
+static int
+test_thash_perf(void)
+{
+	run_thash_test(IPV4_2_TUPLE_LEN);
+	run_thash_test(IPV4_4_TUPLE_LEN);
+	run_thash_test(IPV6_2_TUPLE_LEN);
+	run_thash_test(IPV6_4_TUPLE_LEN);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-26 20:29           ` Medvedkin, Vladimir
@ 2021-10-27  8:29             ` Thomas Monjalon
  2021-10-27 15:48               ` Medvedkin, Vladimir
  0 siblings, 1 reply; 72+ messages in thread
From: Thomas Monjalon @ 2021-10-27  8:29 UTC (permalink / raw)
  To: Medvedkin, Vladimir
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

26/10/2021 22:29, Medvedkin, Vladimir:
> Hi Thomas,
> 
> Thanks for the review, I'll address your comments in v6.
> Please find my comment below
> 
> On 25/10/2021 19:02, Thomas Monjalon wrote:
> > 21/10/2021 20:54, Vladimir Medvedkin:
> >> This patch adds performance tests for different implementations
> >> of the Toeplitz hash function.
> > 
> > Please name them.
> > 
> >> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> > 
> > There are some garbage,
> > 
> >> @@ -320,6 +321,7 @@ perf_test_names = [
> >>           'hash_readwrite_lf_perf_autotest',
> >>           'trace_perf_autotest',
> >>           'ipsec_perf_autotest',
> >> +	'thash_perf_autotest',
> > 
> > here (tabs instead of space)
> > 
> >>   driver_test_names = [
> >> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
> >> new file mode 100644
> >> index 0000000..fb66e20
> >> --- /dev/null
> >> +++ b/app/test/test_thash_perf.c
> >> @@ -0,0 +1,120 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2021 Intel Corporation
> >> + */
> >> +
> >> +#include <stdio.h>
> >> +#include <stdint.h>
> >> +#include <stdlib.h>
> >> +#include <math.h>
> >> +
> >> +#include <rte_cycles.h>
> >> +#include <rte_malloc.h>
> >> +#include <rte_random.h>
> >> +#include <rte_thash.h>
> >> +
> >> +#include "test.h"
> >> +
> >> +#define ITERATIONS	(1 << 15)
> >> +#define	BATCH_SZ	(1 << 10)
> >> +
> >> +#define IPV4_2_TUPLE_LEN	(8)
> >> +#define IPV4_4_TUPLE_LEN	(12)
> >> +#define IPV6_2_TUPLE_LEN	(32)
> >> +#define IPV6_4_TUPLE_LEN	(36)
> >> +
> >> +
> >> +static uint8_t default_rss_key[] = {
> >> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
> >> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
> >> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
> >> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
> >> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
> >> +};
> >> +
> >> +static void
> >> +run_thash_test(unsigned int tuple_len)
> >> +{
> >> +	uint32_t *tuples[BATCH_SZ];
> >> +	unsigned int i, j;
> >> +	uint64_t start_tsc, end_tsc;
> >> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
> >> +	volatile uint32_t hash = 0;
> >> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
> >> +
> >> +	for (i = 0; i < BATCH_SZ; i++) {
> >> +		tuples[i] = rte_zmalloc(NULL, len, 0);
> >> +		for (j = 0; j < len / sizeof(uint32_t); j++)
> >> +			tuples[i][j] = rte_rand();
> >> +	}
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		for (j = 0; j < BATCH_SZ; j++) {
> >> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
> >> +				default_rss_key);
> >> +		}
> >> +	}
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		for (j = 0; j < BATCH_SZ; j++) {
> >> +			hash ^= rte_softrss_be(tuples[j], len /
> >> +				sizeof(uint32_t), default_rss_key);
> >> +		}
> >> +	}
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> > 
> > The function could stop here (one function per type of implementation).
> > 
> 
> Could you please clarify what do you mean?
> The function stops here if the machine do not support GFNI, and this is 
> done intentionally. On machine without GFNI it tests only scalar 
> implementations for every given length.

No I mean you can split in smaller functions.

> >> +
> >> +	if (!rte_thash_gfni_supported())
> >> +		return;
> >> +
> >> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
> >> +
> >> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
> >> +		RTE_DIM(default_rss_key));
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++) {
> >> +		for (j = 0; j < BATCH_SZ; j++)
> >> +			hash ^= rte_thash_gfni(rss_key_matrixes,
> >> +				(uint8_t *)tuples[j], len);
> >> +	}
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> >> +
> >> +	start_tsc = rte_rdtsc_precise();
> >> +	for (i = 0; i < ITERATIONS; i++)
> >> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
> >> +			bulk_hash, BATCH_SZ);
> >> +
> >> +	end_tsc = rte_rdtsc_precise();
> >> +
> >> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
> > 
> > and here, the function name is not updated.
> > 
> >> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
> >> +		BATCH_SZ), len);
> >> +
> > 
> > useless blank line
> > 
> >> +}




^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash
  2021-10-27  8:29             ` Thomas Monjalon
@ 2021-10-27 15:48               ` Medvedkin, Vladimir
  0 siblings, 0 replies; 72+ messages in thread
From: Medvedkin, Vladimir @ 2021-10-27 15:48 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

Hi Thomas,

On 27/10/2021 10:29, Thomas Monjalon wrote:
> 26/10/2021 22:29, Medvedkin, Vladimir:
>> Hi Thomas,
>>
>> Thanks for the review, I'll address your comments in v6.
>> Please find my comment below
>>
>> On 25/10/2021 19:02, Thomas Monjalon wrote:
>>> 21/10/2021 20:54, Vladimir Medvedkin:
>>>> This patch adds performance tests for different implementations
>>>> of the Toeplitz hash function.
>>>
>>> Please name them.
>>>
>>>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>>>
>>> There are some garbage,
>>>
>>>> @@ -320,6 +321,7 @@ perf_test_names = [
>>>>            'hash_readwrite_lf_perf_autotest',
>>>>            'trace_perf_autotest',
>>>>            'ipsec_perf_autotest',
>>>> +	'thash_perf_autotest',
>>>
>>> here (tabs instead of space)
>>>
>>>>    driver_test_names = [
>>>> diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
>>>> new file mode 100644
>>>> index 0000000..fb66e20
>>>> --- /dev/null
>>>> +++ b/app/test/test_thash_perf.c
>>>> @@ -0,0 +1,120 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2021 Intel Corporation
>>>> + */
>>>> +
>>>> +#include <stdio.h>
>>>> +#include <stdint.h>
>>>> +#include <stdlib.h>
>>>> +#include <math.h>
>>>> +
>>>> +#include <rte_cycles.h>
>>>> +#include <rte_malloc.h>
>>>> +#include <rte_random.h>
>>>> +#include <rte_thash.h>
>>>> +
>>>> +#include "test.h"
>>>> +
>>>> +#define ITERATIONS	(1 << 15)
>>>> +#define	BATCH_SZ	(1 << 10)
>>>> +
>>>> +#define IPV4_2_TUPLE_LEN	(8)
>>>> +#define IPV4_4_TUPLE_LEN	(12)
>>>> +#define IPV6_2_TUPLE_LEN	(32)
>>>> +#define IPV6_4_TUPLE_LEN	(36)
>>>> +
>>>> +
>>>> +static uint8_t default_rss_key[] = {
>>>> +	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
>>>> +	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
>>>> +	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
>>>> +	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
>>>> +	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
>>>> +};
>>>> +
>>>> +static void
>>>> +run_thash_test(unsigned int tuple_len)
>>>> +{
>>>> +	uint32_t *tuples[BATCH_SZ];
>>>> +	unsigned int i, j;
>>>> +	uint64_t start_tsc, end_tsc;
>>>> +	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
>>>> +	volatile uint32_t hash = 0;
>>>> +	uint32_t bulk_hash[BATCH_SZ] = { 0 };
>>>> +
>>>> +	for (i = 0; i < BATCH_SZ; i++) {
>>>> +		tuples[i] = rte_zmalloc(NULL, len, 0);
>>>> +		for (j = 0; j < len / sizeof(uint32_t); j++)
>>>> +			tuples[i][j] = rte_rand();
>>>> +	}
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++) {
>>>> +		for (j = 0; j < BATCH_SZ; j++) {
>>>> +			hash ^= rte_softrss(tuples[j], len / sizeof(uint32_t),
>>>> +				default_rss_key);
>>>> +		}
>>>> +	}
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++) {
>>>> +		for (j = 0; j < BATCH_SZ; j++) {
>>>> +			hash ^= rte_softrss_be(tuples[j], len /
>>>> +				sizeof(uint32_t), default_rss_key);
>>>> +		}
>>>> +	}
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>
>>> The function could stop here (one function per type of implementation).
>>>
>>
>> Could you please clarify what do you mean?
>> The function stops here if the machine do not support GFNI, and this is
>> done intentionally. On machine without GFNI it tests only scalar
>> implementations for every given length.
> 
> No I mean you can split in smaller functions.
> 

Aha, I see, I'll send v7.

>>>> +
>>>> +	if (!rte_thash_gfni_supported())
>>>> +		return;
>>>> +
>>>> +	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
>>>> +
>>>> +	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
>>>> +		RTE_DIM(default_rss_key));
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++) {
>>>> +		for (j = 0; j < BATCH_SZ; j++)
>>>> +			hash ^= rte_thash_gfni(rss_key_matrixes,
>>>> +				(uint8_t *)tuples[j], len);
>>>> +	}
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>> +
>>>> +	start_tsc = rte_rdtsc_precise();
>>>> +	for (i = 0; i < ITERATIONS; i++)
>>>> +		rte_thash_gfni_bulk(rss_key_matrixes, len, (uint8_t **)tuples,
>>>> +			bulk_hash, BATCH_SZ);
>>>> +
>>>> +	end_tsc = rte_rdtsc_precise();
>>>> +
>>>> +	printf("Average rte_thash_gfni_x2 takes \t%.1f cycles for key len %d\n",
>>>
>>> and here, the function name is not updated.
>>>
>>>> +		(double)(end_tsc - start_tsc) / (double)(ITERATIONS *
>>>> +		BATCH_SZ), len);
>>>> +
>>>
>>> useless blank line
>>>
>>>> +}
> 
> 
> 

-- 
Regards,
Vladimir

^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v7 0/4] optimized Toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (15 preceding siblings ...)
  2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
@ 2021-10-27 16:16 ` Vladimir Medvedkin
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 1/4] hash: add new toeplitz " Vladimir Medvedkin
                   ` (8 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-27 16:16 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch series adds a new optimized implementation of the Toeplitz hash
function using Galois Fields New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
tuple; a bulk variant, rte_thash_gfni_bulk(), is added in a separate patch.
For performance reasons, the implementation is placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by calling rte_thash_gfni_supported() at runtime) before calling
these functions.

v7:
- reworked performance tests code

v6:
- addressed Thomas and Stephen's comments
- squashed the doc and test commits into the relevant code additions
- split into 2 patches, one per implementation - single and bulk

v5:
- rebase on the latest main
- fix spelling

v4:
- included rte_log.h inside the rte_thash_gfni.h

v3:
- implementation moved to x86 specific header
- added rte_thash_gfni_supported() instead of the variable
- removed RTE_INIT section, due to adding rte_thash_gfni_supported()
- reworked rte_thash_complete_matrix() to make it easier to read

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (4):
  hash: add new toeplitz hash implementation
  hash: add bulk toeplitz hash implementation
  hash: enable gfni thash implementation
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 237 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 135 ++++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  38 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  71 ++++++++-
 lib/hash/rte_thash.h                        |  54 +++++++
 lib/hash/rte_thash_gfni.h                   |  87 ++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 221 ++++++++++++++++++++++++++
 lib/hash/version.map                        |   6 +
 12 files changed, 853 insertions(+), 9 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v7 1/4] hash: add new toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (16 preceding siblings ...)
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-10-27 16:16 ` Vladimir Medvedkin
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 2/4] hash: add bulk " Vladimir Medvedkin
                   ` (7 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-27 16:16 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds a new Toeplitz hash implementation using
Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 172 ++++++++++++++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  28 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  29 +++++
 lib/hash/rte_thash.h                        |  35 ++++++
 lib/hash/rte_thash_gfni.h                   |  54 +++++++++
 lib/hash/rte_thash_x86_gfni.h               | 182 ++++++++++++++++++++++++++++
 lib/hash/version.map                        |   5 +
 10 files changed, 511 insertions(+), 5 deletions(-)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..22d784e 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,146 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple;
+	uint32_t rss_l3, rss_l3l4;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(v4_tbl); i++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if ((rss_l3 != v4_tbl[i].hash_l3) ||
+				(rss_l3l4 != v4_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_DIM(v6_tbl); i++) {
+		for (j = 0; j < RTE_DIM(tuple.v6.src_addr); j++)
+			tuple.v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple.v6.dst_addr); j++)
+			tuple.v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if ((rss_l3 != v6_tbl[i].hash_l3) ||
+				(rss_l3l4 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < ITER; i++) {
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]);
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+enum {
+	RSS_V4_IDX,
+	RSS_V6_IDX
+};
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t arr[16];
+	uint32_t arr_softrss[16];
+	uint32_t hash_1, hash_2;
+	uint64_t rss_key_matrixes[RTE_DIM(big_rss_key)];
+	unsigned int i, size = RTE_DIM(arr) * sizeof(uint32_t);
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (i = 0; i < RTE_DIM(arr); i++) {
+		arr[i] = rte_rand();
+		arr_softrss[i] = rte_be_to_cpu_32(arr[i]);
+	}
+
+	hash_1 = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key);
+	hash_2 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)arr, size);
+
+	if (hash_1 != hash_2)
+		return -TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +746,9 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 49892a3..4245b96 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -142,6 +142,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..acdd8c3 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,44 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are three functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above-mentioned _softrss_ functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last function is a vectorized implementation using
+Galois Fields New Instructions. It can be used if ``rte_thash_gfni_supported()`` returns true.
+It expects the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 1ccac87..4daeb4a 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -305,6 +305,10 @@ New Features
     * Pcapng format with timestamps and meta-data.
     * Fixes packet capture with stripped VLAN tags.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..12b1afc 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,8 +7,12 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
+)
+indirect_headers += files(
+        'rte_crc_arm64.h',
+        'rte_thash_x86_gfni.h',
 )
-indirect_headers += files('rte_crc_arm64.h')
 
 sources = files('rte_cuckoo_hash.c', 'rte_fbk_hash.c', 'rte_thash.c')
 deps += ['net']
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..e605a6f 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,35 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512))
+		return 1;
+#endif
+
+	return 0;
+};
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	int i, j;
+	uint8_t *m = (uint8_t *)matrixes;
+	uint8_t left_part, right_part;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < 8; j++) {
+			left_part = rss_key[i] << j;
+			right_part = (uint16_t)(rss_key[i + 1]) >> (8 - j);
+			m[i * 8 + j] = left_part|right_part;
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a26fe56..40146cf 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -24,6 +24,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -219,6 +220,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash are supported.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if GFNI is supported
+ *  0 otherwise
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts Toeplitz hash key (RSS key) into the matrices required
+ * for the GFNI implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..bbacd41
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_log.h>
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param key
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed, in bytes.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..1cb7353
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{	/* XOR-fold the 512-bit xor_acc down to 64 bits and split it in two. */
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+	/* 512 -> 256 bits: XOR the upper 256-bit half into the lower half. */
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+	/* 256 -> 128 bits. */
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+	/* 128 -> 64 bits. */
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+	/* val_1 receives the low 32 bits, val_2 the high 32 bits. */
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{	/* Result is folded by the caller via __rte_thash_xor_reduce(). */
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+	/* Applied to perm_bytes when the previous chunk ended with i == 8. */
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+	/* 64 bytes per pass. NOTE(review): secondary_tuple is never advanced. */
+	for (; len > 0; len -= 64, tuple += 64) {
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		/* load_mask selects the low len bytes when fewer than 64 remain. */
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+		/* chunk_len = bytes loaded in this pass (set bits in load_mask). */
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+			/* The _2 masks select bytes 4-7 of each 64-bit lane. */
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+			/* Each iteration consumes 8 matrices (64 bytes of mtrx). */
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+	/* Tail: (chunk_len + prepend) % 8 leftovers of the last chunk. */
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash of a single tuple using GFNI.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed, in bytes.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+	/* No secondary tuple: val_zero takes the unused high 32 bits. */
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
+
+	return val;
+}
+
+#endif /* _GFNI_ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 8185470..153ab87 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -37,6 +37,7 @@ DPDK_22 {
 EXPERIMENTAL {
 	global:
 
+	#added in 21.05
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
 	rte_thash_find_existing;
@@ -45,4 +46,8 @@ EXPERIMENTAL {
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_init_ctx;
+
+	#added in 21.11
+	rte_thash_complete_matrix;
+	rte_thash_gfni_supported;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v7 2/4] hash: add bulk toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (17 preceding siblings ...)
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 1/4] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-10-27 16:16 ` Vladimir Medvedkin
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
                   ` (6 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-27 16:16 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds a bulk version for the Toeplitz hash implemented
with Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 67 ++++++++++++++++++++++++++++-
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 20 ++++++---
 lib/hash/rte_thash_gfni.h                   | 33 ++++++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 39 +++++++++++++++++
 4 files changed, 153 insertions(+), 6 deletions(-)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index 22d784e..a625306 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -230,6 +230,8 @@ enum {
 	SCALAR_DATA_BUF_2_HASH_IDX,
 	GFNI_DATA_BUF_1_HASH_IDX,
 	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_BULK_DATA_BUF_1_HASH_IDX,
+	GFNI_BULK_DATA_BUF_2_HASH_IDX,
 	HASH_IDXES
 };
 
@@ -241,6 +243,7 @@ test_toeplitz_hash_rand_data(void)
 	uint32_t hash[HASH_IDXES] = { 0 };
 	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
 	int i, j;
+	uint8_t *bulk_data[2];
 
 	if (!rte_thash_gfni_supported())
 		return TEST_SKIPPED;
@@ -248,6 +251,9 @@ test_toeplitz_hash_rand_data(void)
 	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
 		RTE_DIM(default_rss_key));
 
+	for (i = 0; i < 2; i++)
+		bulk_data[i] = (uint8_t *)data[i];
+
 	for (i = 0; i < ITER; i++) {
 		for (j = 0; j < DATA_SZ; j++) {
 			data[0][j] = rte_rand();
@@ -266,11 +272,18 @@ test_toeplitz_hash_rand_data(void)
 		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
 			rss_key_matrixes, (uint8_t *)data[1],
 			DATA_SZ * sizeof(uint32_t));
+		rte_thash_gfni_bulk(rss_key_matrixes,
+			DATA_SZ * sizeof(uint32_t), bulk_data,
+			&hash[GFNI_BULK_DATA_BUF_1_HASH_IDX], 2);
 
 		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
 				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_1_HASH_IDX]) ||
 				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
-				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_2_HASH_IDX]))
 
 			return -TEST_FAILED;
 	}
@@ -284,6 +297,57 @@ enum {
 };
 
 static int
+test_toeplitz_hash_gfni_bulk(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	uint8_t *tuples[2];
+	uint32_t rss[2] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	/* Nothing to test without GFNI support. */
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert the RSS key into matrices */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+	/* NOTE(review): tuples[] is never rte_free()d on any return path. */
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/* allocate enough memory for the biggest tuple */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/* Fill the IPv4 tuple and copy it into its buffer */
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/* Fill the IPv6 tuple and copy it into its buffer */
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+		/* Both buffers are hashed over the (longer) IPv6 length. */
+		rte_thash_gfni_bulk(rss_key_matrixes, RTE_THASH_V6_L4_LEN * 4,
+			tuples, rss, 2);
+		/* Compare with the expected hashes from the test tables. */
+		if ((rss[RSS_V4_IDX] != v4_tbl[i].hash_l3l4) ||
+				(rss[RSS_V6_IDX] != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_big_tuple_gfni(void)
 {
 	uint32_t arr[16];
@@ -748,6 +812,7 @@ static struct unit_test_suite thash_tests = {
 	TEST_CASE(test_toeplitz_hash_calc),
 	TEST_CASE(test_toeplitz_hash_gfni),
 	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_bulk),
 	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index acdd8c3..61eaafd 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,11 +19,12 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are three functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
 * ``rte_thash_gfni()``
+* ``rte_thash_gfni_bulk()``
 
 First two functions are scalar implementation and take the parameters:
 
@@ -38,11 +39,12 @@ to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
-The last function is vectorized implementation using
-Galois Fields New Instructions. Could be used if ``rte_thash_gfni_supported`` returns true.
-It expects the tuple to be in network byte order.
+The last two functions are vectorized implementations using
+Galois Fields New Instructions. They can be used if ``rte_thash_gfni_supported()`` returns true.
+They expect the tuple to be in network byte order.
 
-``rte_thash_gfni()`` calculates the hash value for a single tuple
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_bulk()`` is a bulk implementation of ``rte_thash_gfni()``.
 
 ``rte_thash_gfni()`` takes the parameters:
 
@@ -50,6 +52,14 @@ It expects the tuple to be in network byte order.
 * A pointer to the tuple.
 * A length of the tuple in bytes.
 
+``rte_thash_gfni_bulk()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A length of the longest tuple in bytes.
+* An array of pointers to the data to be hashed.
+* An array of ``uint32_t`` in which to store the calculated Toeplitz hash values.
+* Number of tuples in a bulk.
+
 ``rte_thash_complete_matrix()`` is a function that calculates matrices required by
 GFNI implementations from the RSS hash key. It takes the parameters:
 
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
index bbacd41..e97d912 100644
--- a/lib/hash/rte_thash_gfni.h
+++ b/lib/hash/rte_thash_gfni.h
@@ -45,6 +45,39 @@ rte_thash_gfni(const uint64_t *mtrx __rte_unused,
 	return 0;
 }
 
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Dummy implementation, used when RTE_THASH_GFNI_DEFINED is not set.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix(). Unused here.
+ * @param len
+ *  Length of the largest data buffer to be hashed. Unused here.
+ * @param tuple
+ *  Array of pointers to the data to be hashed.
+ *  Data must be in network byte order. Unused here.
+ * @param val
+ *  Array of uint32_t; the first num entries are set to 0.
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
+	int len __rte_unused, uint8_t *tuple[] __rte_unused,
+	uint32_t val[], uint32_t num)
+{
+	unsigned int i;
+	/* Log the error and zero every output slot. */
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	for (i = 0; i < num; i++)
+		val[i] = 0;
+}
+
 #endif /* RTE_THASH_GFNI_DEFINED */
 
 #ifdef __cplusplus
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
index 1cb7353..7bfb937 100644
--- a/lib/hash/rte_thash_x86_gfni.h
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -173,6 +173,45 @@ rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
 	return val;
 }
 
+/**
+ * Bulk implementation for Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ * @param tuple
+ *  Array of pointers to the data to be hashed.
+ *  Data must be in network byte order.
+ * @param val
+ *  Array of uint32_t where the calculated Toeplitz hash values are stored.
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
+	uint32_t val[], uint32_t num)
+{
+	uint32_t i;
+	uint32_t val_zero;
+	__m512i xor_acc;
+	/* Hash tuples two at a time; len is applied to both of a pair. */
+	for (i = 0; i != (num & ~1); i += 2) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], tuple[i + 1], len);
+		__rte_thash_xor_reduce(xor_acc, val + i, val + i + 1);
+	}
+	/* Odd num: hash the last tuple alone; val_zero is discarded. */
+	if (num & 1) {
+		xor_acc = __rte_thash_gfni(mtrx, tuple[i], NULL, len);
+		__rte_thash_xor_reduce(xor_acc, val + i, &val_zero);
+	}
+}
+
 #endif /* _GFNI_ */
 
 #ifdef __cplusplus
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v7 3/4] hash: enable gfni thash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (18 preceding siblings ...)
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 2/4] hash: add bulk " Vladimir Medvedkin
@ 2021-10-27 16:16 ` Vladimir Medvedkin
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
                   ` (5 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-27 16:16 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch enables new GFNI Toeplitz hash in
predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index e605a6f..242d0ff 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -87,6 +87,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -266,12 +268,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported()) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -385,6 +403,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -641,6 +663,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{	/* Plain accessor; ctx is dereferenced without a NULL check. */
+	return ctx->matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -752,11 +780,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 40146cf..c11ca0d 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -419,6 +419,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the Toeplitz hash matrices contained in the context.
+ * These matrices can be used with the fast Toeplitz hash implementation
+ * (rte_thash_gfni()) if the CPU supports GFNI.
+ * The matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the Toeplitz hash key matrices on success,
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 153ab87..705c3f3 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -49,5 +49,6 @@ EXPERIMENTAL {
 
 	#added in 21.11
 	rte_thash_complete_matrix;
+	rte_thash_get_gfni_matrices;
 	rte_thash_gfni_supported;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v7 4/4] test/thash: add performance tests for the Toeplitz hash
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (19 preceding siblings ...)
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-10-27 16:16 ` Vladimir Medvedkin
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (4 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-10-27 16:16 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds performance tests for the following Toeplitz hash
function implementations:
  Scalar:
    - rte_softrss()
    - rte_softrss_be()
  Vector using gfni:
    - rte_thash_gfni()
    - rte_thash_gfni_bulk()

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 135 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 20f36a1..913e8f6 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -144,6 +144,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -322,6 +323,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+        'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..7aa9360
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static const uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+enum test_rss_type {
+	TEST_SOFTRSS,
+	TEST_SOFTRSS_BE,
+	TEST_RSS_GFNI
+};
+
+static inline uint64_t
+run_rss_calc(uint32_t *tuples[BATCH_SZ], enum test_rss_type type, int len,
+	const void *key)
+{	/* Returns the TSC delta over ITERATIONS * BATCH_SZ hash calls. */
+	int i, j;
+	uint64_t start_tsc, end_tsc;
+	volatile uint32_t hash = 0;
+	/* hash is volatile, presumably so the calls are not optimized away. */
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			if (type == TEST_SOFTRSS)
+				hash ^= rte_softrss(tuples[j], len /
+					sizeof(uint32_t), (const uint8_t *)key);
+			else if (type == TEST_SOFTRSS_BE)
+				hash ^= rte_softrss_be(tuples[j], len /
+					sizeof(uint32_t), (const uint8_t *)key);
+			else	/* TEST_RSS_GFNI: key points to the matrices */
+				hash ^= rte_thash_gfni((const uint64_t *)key,
+					(uint8_t *)tuples[j], len);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	return end_tsc - start_tsc;
+}
+
+static inline uint64_t
+run_rss_calc_bulk(uint32_t *tuples[BATCH_SZ], int len, const void *key)
+{	/* Returns the TSC delta over ITERATIONS bulk calls of BATCH_SZ. */
+	int i;
+	uint64_t start_tsc, end_tsc;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+	/* key points to the matrices; bulk_hash is overwritten on each call. */
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk((const uint64_t *)key, len,
+			(uint8_t **)tuples, bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	return end_tsc - start_tsc;
+}
+
+static void
+run_thash_test(unsigned int tuple_len)
+{	/* Time each implementation on BATCH_SZ random tuples and print it. */
+	uint32_t *tuples[BATCH_SZ];
+	unsigned int i, j;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	uint64_t tsc_diff;
+	/* NOTE(review): rte_zmalloc() is unchecked; tuples[] is never freed. */
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+	/* NOTE(review): len is uint32_t but is printed with %d below. */
+	tsc_diff = run_rss_calc(tuples, TEST_SOFTRSS, len, default_rss_key);
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+
+	tsc_diff = run_rss_calc(tuples, TEST_SOFTRSS_BE, len,
+		default_rss_key);
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+	/* The remaining measurements are skipped without GFNI support. */
+	if (!rte_thash_gfni_supported())
+		return;
+
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	tsc_diff = run_rss_calc(tuples, TEST_RSS_GFNI, len, rss_key_matrixes);
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+
+	tsc_diff = run_rss_calc_bulk(tuples, len, rss_key_matrixes);
+	printf("Average rte_thash_gfni_bulk takes \t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+}
+
+static int
+test_thash_perf(void)
+{	/* Run the benchmark once per tuple length; always returns 0. */
+	run_thash_test(IPV4_2_TUPLE_LEN);
+	run_thash_test(IPV4_4_TUPLE_LEN);
+	run_thash_test(IPV6_2_TUPLE_LEN);
+	run_thash_test(IPV6_4_TUPLE_LEN);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (20 preceding siblings ...)
  2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
@ 2021-11-02 18:38 ` Vladimir Medvedkin
  2021-11-04 10:20   ` Thomas Monjalon
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 1/4] hash: add new toeplitz " Vladimir Medvedkin
                   ` (3 subsequent siblings)
  25 siblings, 1 reply; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-11-02 18:38 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch series adds a new optimized implementation for the Toeplitz hash
function using Galois Fields New Instructions (GFNI).
The main use case of this function is to calculate the hash value for a single
tuple; a bulk version is also provided in a separate patch.
For performance reasons, the implementation was placed in a public header.
It is the responsibility of the user to ensure the platform supports GFNI
(by calling rte_thash_gfni_supported() at run time) before calling
these functions.

v8:
- rebased on the latest main
- fixed buffer overflow reported by ASAN

v7:
- reworked performance tests code

v6:
- addressed Thomas and Stephen's comments
- squashed the doc and test commits with the relevant code addition
- split into 2 patches, one per implementation - single and bulk

v5:
- rebase on the latest main
- fix spelling

v4:
- included rte_log.h inside the rte_thash_gfni.h

v3:
- implementation moved to x86 specific header
- added rte_thash_gfni_supported() instead of the variable
- removed RTE_INIT section, due to adding rte_thash_gfni_supported()
- reworked rte_thash_complete_matrix() to make it easier to read

v2:
- fixed typos
- made big_rss_key static const and indented
- addressed Konstantin's comments

Vladimir Medvedkin (4):
  hash: add new toeplitz hash implementation
  hash: add bulk toeplitz hash implementation
  hash: enable gfni thash implementation
  test/thash: add performance tests for the Toeplitz hash

 app/test/meson.build                        |   2 +
 app/test/test_thash.c                       | 237 ++++++++++++++++++++++++++++
 app/test/test_thash_perf.c                  | 135 ++++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  38 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  72 ++++++++-
 lib/hash/rte_thash.h                        |  54 +++++++
 lib/hash/rte_thash_gfni.h                   |  87 ++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 223 ++++++++++++++++++++++++++
 lib/hash/version.map                        |   6 +
 12 files changed, 856 insertions(+), 9 deletions(-)
 create mode 100644 app/test/test_thash_perf.c
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v8 1/4] hash: add new toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (21 preceding siblings ...)
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-11-02 18:38 ` Vladimir Medvedkin
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 2/4] hash: add bulk " Vladimir Medvedkin
                   ` (2 subsequent siblings)
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-11-02 18:38 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds a new Toeplitz hash implementation using
Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 172 ++++++++++++++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  28 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  30 +++++
 lib/hash/rte_thash.h                        |  35 ++++++
 lib/hash/rte_thash_gfni.h                   |  54 ++++++++
 lib/hash/rte_thash_x86_gfni.h               | 183 ++++++++++++++++++++++++++++
 lib/hash/version.map                        |   5 +
 10 files changed, 513 insertions(+), 5 deletions(-)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..22d784e 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = { /* 200-byte key: the same 40-byte sequence repeated five times */
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,146 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	union rte_thash_tuple tuple;
+	uint32_t hash_l3, hash_l3l4;
+	uint32_t row, word;
+
+	/* The GFNI path can only be exercised when it is reported usable. */
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Derive the GFNI matrices from the default RSS key. */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	/* IPv4 vectors: every field goes through rte_cpu_to_be_*(). */
+	for (row = 0; row < RTE_DIM(v4_tbl); row++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[row].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[row].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[row].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[row].src_port);
+
+		hash_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		hash_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if (!(hash_l3 == v4_tbl[row].hash_l3 &&
+				hash_l3l4 == v4_tbl[row].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	/* IPv6 vectors: addresses are copied element by element. */
+	for (row = 0; row < RTE_DIM(v6_tbl); row++) {
+		for (word = 0; word < RTE_DIM(tuple.v6.src_addr); word++)
+			tuple.v6.src_addr[word] = v6_tbl[row].src_ip[word];
+		for (word = 0; word < RTE_DIM(tuple.v6.dst_addr); word++)
+			tuple.v6.dst_addr[word] = v6_tbl[row].dst_ip[word];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[row].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[row].src_port);
+
+		hash_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		hash_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if (!(hash_l3 == v6_tbl[row].hash_l3 &&
+				hash_l3l4 == v6_tbl[row].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4	/* length of each random buffer, in 32-bit words */
+#define ITER		1000	/* number of random-data rounds */
+
+enum { /* slots of the hash[] array in test_toeplitz_hash_rand_data() */
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{	/*
+	 * Cross-checks rte_thash_gfni() against the scalar rte_softrss() over
+	 * ITER rounds of random data, two buffers per round. Skipped when
+	 * rte_thash_gfni_supported() reports that GFNI is not usable.
+	 */
+	uint32_t data[2][DATA_SZ]; /* raw random words, hashed as bytes by rte_thash_gfni() */
+	uint32_t scalar_data[2][DATA_SZ]; /* rte_cpu_to_be_32() of data[], input of rte_softrss() */
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < ITER; i++) {
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]); /* same bytes, word-swapped for the scalar API */
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key); /* scalar length is counted in 32-bit words */
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t)); /* GFNI length is counted in bytes */
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+enum {
+	RSS_V4_IDX, /* slot of the IPv4 hash in a two-element result array */
+	RSS_V6_IDX /* slot of the IPv6 hash in a two-element result array */
+};
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t arr[16];
+	uint32_t arr_softrss[16]; /* byte-swapped copy of arr[] for rte_softrss() */
+	uint32_t hash_1, hash_2;
+	uint64_t rss_key_matrixes[RTE_DIM(big_rss_key)];
+	unsigned int i, size = RTE_DIM(arr) * sizeof(uint32_t); /* tuple size in bytes: 16 words = 64 bytes */
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (i = 0; i < RTE_DIM(arr); i++) {
+		arr[i] = rte_rand();
+		arr_softrss[i] = rte_be_to_cpu_32(arr[i]);
+	}
+
+	hash_1 = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key); /* scalar reference, length in words */
+	hash_2 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)arr, size); /* GFNI result over the same 64 bytes */
+
+	if (hash_1 != hash_2)
+		return -TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +746,9 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 49892a3..4245b96 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -142,6 +142,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..acdd8c3 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,44 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are three functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
 
-Both of these functions take the parameters:
+First two functions are scalar implementation and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of above mentioned _softrss_ functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last function is vectorized implementation using
+Galois Fields New Instructions. Could be used if ``rte_thash_gfni_supported`` returns true.
+It expects the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 502cc5c..c22617b 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -314,6 +314,10 @@ New Features
   overruns in C/C++ programs, and other similar errors, as well as
   printing out detailed debug information whenever an error is detected.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..12b1afc 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,8 +7,12 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
+)
+indirect_headers += files(
+        'rte_crc_arm64.h',
+        'rte_thash_x86_gfni.h',
 )
-indirect_headers += files('rte_crc_arm64.h')
 
 sources = files('rte_cuckoo_hash.c', 'rte_fbk_hash.c', 'rte_thash.c')
 deps += ['net']
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..9d66a5d 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,36 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+/*
+ * Report whether the GFNI Toeplitz hash may be used at run time.
+ *
+ * Returns 1 only when the library was built with RTE_THASH_GFNI_DEFINED,
+ * the CPU reports the GFNI flag and the maximum SIMD bitwidth is at least
+ * RTE_VECT_SIMD_512; returns 0 in every other case.
+ */
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512))
+		return 1;
+#endif
+
+	return 0;
+}
+
+/*
+ * Expand an RSS key into the byte matrices consumed by the GFNI hash.
+ *
+ * For every key byte, eight output bytes are written: output byte `shift`
+ * holds the 8-bit window of the key that starts `shift` bits into that key
+ * byte. The window wraps around to the first key byte at the end of the key.
+ * The output buffer must therefore hold size * 8 bytes.
+ */
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	uint8_t *rows = (uint8_t *)matrixes;
+	int byte, shift;
+
+	for (byte = 0; byte < size; byte++) {
+		for (shift = 0; shift < 8; shift++) {
+			/* current key byte in the high half, next one below it */
+			const uint16_t window =
+				((uint16_t)rss_key[byte] << 8) |
+				rss_key[(byte + 1) % size];
+
+			rows[byte * 8 + shift] =
+				(uint8_t)(window >> (8 - shift));
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a26fe56..40146cf 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -24,6 +24,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -219,6 +220,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash are supported.
+ * This is a run-time check: call it before using rte_thash_gfni() or
+ * rte_thash_gfni_bulk().
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if GFNI is supported
+ *  0 otherwise
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts a Toeplitz hash key (RSS key) into the matrices required
+ * by the GFNI implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8 bytes,
+ *  i.e. one uint64_t per byte of the key.
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..bbacd41
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_log.h>
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation, used when no GFNI implementation is available
+ * (RTE_THASH_GFNI_DEFINED is not set). It only logs an error.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix(). Unused here.
+ * @param key
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ *  Unused here.
+ * @param len
+ *  Length of the data to be hashed. Unused here.
+ * @return
+ *  Always 0 in this dummy implementation.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..53486b6
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08 /* primary-tuple permute mask, first step of each 64-byte chunk */
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f /* primary-tuple permute mask, later steps: low 4 bytes of each 8-byte lane */
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080 /* secondary-tuple counterpart of RTE_THASH_FIRST_ITER_MSK */
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0 /* secondary-tuple permute mask: high 4 bytes of each 8-byte lane */
+#define RTE_THASH_REWIND_MSK		0x0000000000113377 /* zeroing mask paired with rewind_idx in __rte_thash_gfni() */
+
+/*
+ * Fold a 512-bit XOR accumulator down to 64 bits by XOR-ing halves together
+ * (512 -> 256 -> 128 -> 64), then hand back the low 32 bits through val_1
+ * and the high 32 bits through val_2.
+ */
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	/* 512 -> 256 bits */
+	const __m256i lo256 = _mm512_castsi512_si256(xor_acc);
+	const __m256i hi256 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	const __m256i fold256 = _mm256_xor_si256(lo256, hi256);
+
+	/* 256 -> 128 bits */
+	const __m128i lo128 = _mm256_castsi256_si128(fold256);
+	const __m128i hi128 = _mm256_extracti32x4_epi32(fold256, 1);
+	const __m128i fold128 = _mm_xor_si128(lo128, hi128);
+
+	/* 128 -> 64 bits */
+	const uint64_t lo64 = _mm_extract_epi64(fold128, 0);
+	const uint64_t hi64 = _mm_extract_epi64(fold128, 1);
+	const uint64_t fold64 = lo64 ^ hi64;
+
+	*val_1 = (uint32_t)fold64;
+	*val_2 = (uint32_t)(fold64 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{	/*
+	 * Shared kernel behind rte_thash_gfni() and rte_thash_gfni_bulk().
+	 * Hashes `tuple` and, when `secondary_tuple` is not NULL, a second
+	 * buffer of the same `len`: the primary tuple uses the low 4 bytes of
+	 * every 8-byte lane, the secondary tuple the high 4 bytes. The
+	 * returned accumulator is meant to be folded with
+	 * __rte_thash_xor_reduce().
+	 */
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3; /* presumably the 3 look-back bytes matching the negative permute_idx entries - TODO confirm */
+
+	for (; len > 0; len -= 64, tuple += 64) { /* one pass per 64-byte chunk of the primary tuple.
+		 * NOTE(review): secondary_tuple is not advanced alongside
+		 * tuple, so a second pass reloads its first bytes - confirm
+		 * the intended behaviour for len > 64 in the bulk path.
+		 */
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1); /* one mask bit per valid tuple byte */
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask); /* number of bytes loaded in this chunk */
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) { /* each step consumes 8 matrices */
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8); /* move every index 8 tuple bytes forward */
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+
+	int rest_len = (chunk_len + prepend) % 8; /* matrices left after the full 8-matrix steps.
+	 * NOTE(review): if len <= 0 on entry the loop above never runs, so
+	 * rest_len is 3 and tuple_bytes / permute_mask are read below without
+	 * having been set - confirm callers always pass len > 0.
+	 */
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1; /* load only the remaining rest_len matrices */
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed, in bytes.
+ *  Note that @p len should not exceed the length of the rss_key minus 4.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len); /* NULL: no secondary tuple */
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero); /* val_zero takes the unused secondary-tuple half */
+
+	return val;
+}
+
+#endif /* _GFNI_ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 8185470..153ab87 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -37,6 +37,7 @@ DPDK_22 {
 EXPERIMENTAL {
 	global:
 
+	#added in 21.05
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
 	rte_thash_find_existing;
@@ -45,4 +46,8 @@ EXPERIMENTAL {
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_init_ctx;
+
+	#added in 21.11
+	rte_thash_complete_matrix;
+	rte_thash_gfni_supported;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v8 2/4] hash: add bulk toeplitz hash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (22 preceding siblings ...)
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 1/4] hash: add new toeplitz " Vladimir Medvedkin
@ 2021-11-02 18:38 ` Vladimir Medvedkin
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-11-02 18:38 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds a bulk version for the Toeplitz hash implemented
with Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 67 ++++++++++++++++++++++++++++-
 doc/guides/prog_guide/toeplitz_hash_lib.rst | 20 ++++++---
 lib/hash/rte_thash_gfni.h                   | 33 ++++++++++++++
 lib/hash/rte_thash_x86_gfni.h               | 40 +++++++++++++++++
 4 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index 22d784e..a625306 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -230,6 +230,8 @@ enum {
 	SCALAR_DATA_BUF_2_HASH_IDX,
 	GFNI_DATA_BUF_1_HASH_IDX,
 	GFNI_DATA_BUF_2_HASH_IDX,
+	GFNI_BULK_DATA_BUF_1_HASH_IDX,
+	GFNI_BULK_DATA_BUF_2_HASH_IDX,
 	HASH_IDXES
 };
 
@@ -241,6 +243,7 @@ test_toeplitz_hash_rand_data(void)
 	uint32_t hash[HASH_IDXES] = { 0 };
 	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
 	int i, j;
+	uint8_t *bulk_data[2];
 
 	if (!rte_thash_gfni_supported())
 		return TEST_SKIPPED;
@@ -248,6 +251,9 @@ test_toeplitz_hash_rand_data(void)
 	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
 		RTE_DIM(default_rss_key));
 
+	for (i = 0; i < 2; i++)
+		bulk_data[i] = (uint8_t *)data[i];
+
 	for (i = 0; i < ITER; i++) {
 		for (j = 0; j < DATA_SZ; j++) {
 			data[0][j] = rte_rand();
@@ -266,11 +272,18 @@ test_toeplitz_hash_rand_data(void)
 		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
 			rss_key_matrixes, (uint8_t *)data[1],
 			DATA_SZ * sizeof(uint32_t));
+		rte_thash_gfni_bulk(rss_key_matrixes,
+			DATA_SZ * sizeof(uint32_t), bulk_data,
+			&hash[GFNI_BULK_DATA_BUF_1_HASH_IDX], 2);
 
 		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
 				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_1_HASH_IDX]) ||
 				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
-				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+				hash[GFNI_DATA_BUF_2_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_BULK_DATA_BUF_2_HASH_IDX]))
 
 			return -TEST_FAILED;
 	}
@@ -284,6 +297,57 @@ enum {
 };
 
 static int
+test_toeplitz_hash_gfni_bulk(void)
+{
+	/*
+	 * Hashes one IPv4 and one IPv6 tuple per round with a single
+	 * rte_thash_gfni_bulk() call and compares both results with the
+	 * expected L3+L4 hashes from v4_tbl / v6_tbl.
+	 * Returns TEST_SKIPPED without GFNI, -TEST_FAILED on allocation
+	 * failure or hash mismatch, TEST_SUCCESS otherwise.
+	 */
+	uint32_t i, j;
+	union rte_thash_tuple tuple[2];
+	uint8_t *tuples[2] = { NULL, NULL };
+	uint32_t rss[2] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int ret = -TEST_FAILED;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(tuples); i++) {
+		/*
+		 * Zeroed buffers big enough for the biggest tuple: the IPv4
+		 * tuple is hashed with the IPv6 length, so the bytes behind
+		 * it have to stay zero.
+		 */
+		tuples[i] = rte_zmalloc(NULL, RTE_THASH_V6_L4_LEN * 4, 0);
+		if (tuples[i] == NULL)
+			goto out;
+	}
+
+	for (i = 0; i < RTE_MIN(RTE_DIM(v4_tbl), RTE_DIM(v6_tbl)); i++) {
+		/*Load IPv4 headers and copy it into the corresponding tuple*/
+		tuple[0].v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple[0].v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple[0].v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple[0].v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+		rte_memcpy(tuples[0], &tuple[0], RTE_THASH_V4_L4_LEN * 4);
+
+		/*Load IPv6 headers and copy it into the corresponding tuple*/
+		for (j = 0; j < RTE_DIM(tuple[1].v6.src_addr); j++)
+			tuple[1].v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple[1].v6.dst_addr); j++)
+			tuple[1].v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple[1].v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple[1].v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rte_memcpy(tuples[1], &tuple[1], RTE_THASH_V6_L4_LEN * 4);
+
+		rte_thash_gfni_bulk(rss_key_matrixes, RTE_THASH_V6_L4_LEN * 4,
+			tuples, rss, 2);
+
+		if ((rss[RSS_V4_IDX] != v4_tbl[i].hash_l3l4) ||
+				(rss[RSS_V6_IDX] != v6_tbl[i].hash_l3l4))
+			goto out;
+	}
+
+	ret = TEST_SUCCESS;
+
+out:
+	/* Release the buffers on every path; rte_free(NULL) does nothing. */
+	for (i = 0; i < RTE_DIM(tuples); i++)
+		rte_free(tuples[i]);
+
+	return ret;
+}
+
+static int
 test_big_tuple_gfni(void)
 {
 	uint32_t arr[16];
@@ -748,6 +812,7 @@ static struct unit_test_suite thash_tests = {
 	TEST_CASE(test_toeplitz_hash_calc),
 	TEST_CASE(test_toeplitz_hash_gfni),
 	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_toeplitz_hash_gfni_bulk),
 	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index acdd8c3..61eaafd 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,11 +19,12 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are three functions that provide calculation of the Toeplitz hash sum:
+There are four functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
 * ``rte_thash_gfni()``
+* ``rte_thash_gfni_bulk()``
 
 First two functions are scalar implementation and take the parameters:
 
@@ -38,11 +39,12 @@ to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
-The last function is vectorized implementation using
-Galois Fields New Instructions. Could be used if ``rte_thash_gfni_supported`` returns true.
-It expects the tuple to be in network byte order.
+The last two functions are vectorized implementations using
+Galois Fields New Instructions. They can be used only if ``rte_thash_gfni_supported()`` returns true.
+They expect the tuple to be in network byte order.
 
-``rte_thash_gfni()`` calculates the hash value for a single tuple
+``rte_thash_gfni()`` calculates the hash value for a single tuple, and
+``rte_thash_gfni_bulk()`` is the bulk implementation of ``rte_thash_gfni()``.
 
 ``rte_thash_gfni()`` takes the parameters:
 
@@ -50,6 +52,14 @@ It expects the tuple to be in network byte order.
 * A pointer to the tuple.
 * A length of the tuple in bytes.
 
+``rte_thash_gfni_bulk()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A length of the longest tuple in bytes.
+* Array of the pointers on data to be hashed.
+* Array of ``uint32_t`` where to put calculated Toeplitz hash values
+* Number of tuples in a bulk.
+
 ``rte_thash_complete_matrix()`` is a function that calculates matrices required by
 GFNI implementations from the RSS hash key. It takes the parameters:
 
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
index bbacd41..e97d912 100644
--- a/lib/hash/rte_thash_gfni.h
+++ b/lib/hash/rte_thash_gfni.h
@@ -45,6 +45,39 @@ rte_thash_gfni(const uint64_t *mtrx __rte_unused,
 	return 0;
 }
 
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Dummy implementation: logs an error and zeroes every output value.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix(). Unused here.
+ * @param len
+ *  Length of the largest data buffer to be hashed. Unused here.
+ * @param tuple
+ *  Array of the pointers on data to be hashed.
+ *  Data must be in network byte order. Unused here.
+ * @param val
+ *  Array of uint32_t; the first @p num entries are set to 0.
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx __rte_unused,
+	int len __rte_unused, uint8_t *tuple[] __rte_unused,
+	uint32_t val[], uint32_t num)
+{
+	uint32_t *out = val;
+	uint32_t left;
+
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	for (left = num; left != 0; left--)
+		*out++ = 0;
+}
+
 #endif /* RTE_THASH_GFNI_DEFINED */
 
 #ifdef __cplusplus
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
index 53486b6..c2889c3 100644
--- a/lib/hash/rte_thash_x86_gfni.h
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -174,6 +174,46 @@ rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
 	return val;
 }
 
+/**
+ * Bulk implementation for Toeplitz hash.
+ * Tuples are hashed two at a time; an odd trailing tuple is hashed alone.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param len
+ *  Length of the largest data buffer to be hashed.
+ *  Note that @p len should not exceed the length of the rss_key minus 4.
+ * @param tuple
+ *  Array of the pointers on data to be hashed.
+ *  Data must be in network byte order.
+ * @param val
+ *  Array of uint32_t where to put calculated Toeplitz hash values
+ * @param num
+ *  Number of tuples to hash.
+ */
+__rte_experimental
+static inline void
+rte_thash_gfni_bulk(const uint64_t *mtrx, int len, uint8_t *tuple[],
+	uint32_t val[], uint32_t num)
+{
+	const uint32_t paired = num & ~1U; /* largest even count <= num */
+	uint32_t unused_half;
+	uint32_t idx = 0;
+	__m512i acc;
+
+	/* One kernel call hashes tuple[idx] and tuple[idx + 1] together. */
+	while (idx != paired) {
+		acc = __rte_thash_gfni(mtrx, tuple[idx], tuple[idx + 1], len);
+		__rte_thash_xor_reduce(acc, &val[idx], &val[idx + 1]);
+		idx += 2;
+	}
+
+	/* Odd count: the last tuple goes through without a partner. */
+	if (paired != num) {
+		acc = __rte_thash_gfni(mtrx, tuple[idx], NULL, len);
+		__rte_thash_xor_reduce(acc, &val[idx], &unused_half);
+	}
+}
+
 #endif /* _GFNI_ */
 
 #ifdef __cplusplus
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v8 3/4] hash: enable gfni thash implementation
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (23 preceding siblings ...)
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 2/4] hash: add bulk " Vladimir Medvedkin
@ 2021-11-02 18:38 ` Vladimir Medvedkin
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-11-02 18:38 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch enables new GFNI Toeplitz hash in
predictable RSS library.

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/hash/rte_thash.c | 42 ++++++++++++++++++++++++++++++++++++++----
 lib/hash/rte_thash.h | 19 +++++++++++++++++++
 lib/hash/version.map |  1 +
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 9d66a5d..6945a0a 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -87,6 +87,8 @@ struct rte_thash_ctx {
 	uint32_t	reta_sz_log;	/** < size of the RSS ReTa in bits */
 	uint32_t	subtuples_nb;	/** < number of subtuples */
 	uint32_t	flags;
+	uint64_t	*matrices;
+	/**< matrices used with rte_thash_gfni implementation */
 	uint8_t		hash_key[0];
 };
 
@@ -267,12 +269,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
 			ctx->hash_key[i] = rte_rand();
 	}
 
+	if (rte_thash_gfni_supported()) {
+		ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+			RTE_CACHE_LINE_SIZE);
+		if (ctx->matrices == NULL) {
+			RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+			rte_errno = ENOMEM;
+			goto free_ctx;
+		}
+
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			key_len);
+	}
+
 	te->data = (void *)ctx;
 	TAILQ_INSERT_TAIL(thash_list, te, next);
 
 	rte_mcfg_tailq_write_unlock();
 
 	return ctx;
+
+free_ctx:
+	rte_free(ctx);
 free_te:
 	rte_free(te);
 exit:
@@ -386,6 +404,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
 			set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
 	}
 
+	if (ctx->matrices != NULL)
+		rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+			ctx->key_len);
+
 	return 0;
 }
 
@@ -642,6 +664,12 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
 	return ctx->hash_key;
 }
 
+/* Return the GFNI matrices pointer stored in the context (may be NULL). */
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+	const uint64_t *gfni_matrices = ctx->matrices;
+
+	return gfni_matrices;
+}
+
 static inline uint8_t
 read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
 {
@@ -753,11 +781,17 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 	attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
 
 	for (i = 0; i < attempts; i++) {
-		for (j = 0; j < (tuple_len / 4); j++)
-			tmp_tuple[j] =
-				rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+		if (ctx->matrices != NULL)
+			hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+		else {
+			for (j = 0; j < (tuple_len / 4); j++)
+				tmp_tuple[j] =
+					rte_be_to_cpu_32(
+						*(uint32_t *)&tuple[j * 4]);
+
+			hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+		}
 
-		hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
 		adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
 		/*
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 40146cf..c11ca0d 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -419,6 +419,25 @@ const uint8_t *
 rte_thash_get_key(struct rte_thash_ctx *ctx);
 
 /**
+ * Get a pointer to the toeplitz hash matrices contained in the context.
+ * These matrices could be used with fast toeplitz hash implementation if
+ * CPU supports GFNI.
+ * Matrices change after each addition of a helper.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param ctx
+ *  Thash context
+ * @return
+ *  A pointer to the toeplitz hash key matrices on success
+ *  NULL if GFNI is not supported.
+ */
+__rte_experimental
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx);
+
+/**
  * Function prototype for the rte_thash_adjust_tuple
  * to check if adjusted tuple could be used.
  * Generally it is some kind of lookup function to check
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 153ab87..705c3f3 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -49,5 +49,6 @@ EXPERIMENTAL {
 
 	#added in 21.11
 	rte_thash_complete_matrix;
+	rte_thash_get_gfni_matrices;
 	rte_thash_gfni_supported;
 };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* [dpdk-dev] [PATCH v8 4/4] test/thash: add performance tests for the Toeplitz hash
  2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
                   ` (24 preceding siblings ...)
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
@ 2021-11-02 18:38 ` Vladimir Medvedkin
  25 siblings, 0 replies; 72+ messages in thread
From: Vladimir Medvedkin @ 2021-11-02 18:38 UTC (permalink / raw)
  To: dev
  Cc: yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen, thomas

This patch adds performance tests for the following Toeplitz hash
function implementations:
  Scalar:
    - rte_softrss()
    - rte_softrss_be()
  Vector using gfni:
    - rte_thash_gfni()
    - rte_thash_gfni_bulk()

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 app/test/meson.build       |   2 +
 app/test/test_thash_perf.c | 135 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 app/test/test_thash_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 20f36a1..913e8f6 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -144,6 +144,7 @@ test_sources = files(
         'test_table_tables.c',
         'test_tailq.c',
         'test_thash.c',
+        'test_thash_perf.c',
         'test_timer.c',
         'test_timer_perf.c',
         'test_timer_racecond.c',
@@ -322,6 +323,7 @@ perf_test_names = [
         'hash_readwrite_lf_perf_autotest',
         'trace_perf_autotest',
         'ipsec_perf_autotest',
+        'thash_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_thash_perf.c b/app/test/test_thash_perf.c
new file mode 100644
index 0000000..7aa9360
--- /dev/null
+++ b/app/test/test_thash_perf.c
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_thash.h>
+
+#include "test.h"
+
+#define ITERATIONS	(1 << 15)
+#define BATCH_SZ	(1 << 10)
+
+#define IPV4_2_TUPLE_LEN	(8)
+#define IPV4_4_TUPLE_LEN	(12)
+#define IPV6_2_TUPLE_LEN	(32)
+#define IPV6_4_TUPLE_LEN	(36)
+
+
+static const uint8_t default_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+enum test_rss_type {
+	TEST_SOFTRSS,
+	TEST_SOFTRSS_BE,
+	TEST_RSS_GFNI
+};
+
+static inline uint64_t
+run_rss_calc(uint32_t *tuples[BATCH_SZ], enum test_rss_type type, int len,
+	const void *key)
+{
+	int i, j;
+	uint64_t start_tsc, end_tsc;
+	volatile uint32_t hash = 0;
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < BATCH_SZ; j++) {
+			if (type == TEST_SOFTRSS)
+				hash ^= rte_softrss(tuples[j], len /
+					sizeof(uint32_t), (const uint8_t *)key);
+			else if (type == TEST_SOFTRSS_BE)
+				hash ^= rte_softrss_be(tuples[j], len /
+					sizeof(uint32_t), (const uint8_t *)key);
+			else
+				hash ^= rte_thash_gfni((const uint64_t *)key,
+					(uint8_t *)tuples[j], len);
+		}
+	}
+	end_tsc = rte_rdtsc_precise();
+
+	return end_tsc - start_tsc;
+}
+
+static inline uint64_t
+run_rss_calc_bulk(uint32_t *tuples[BATCH_SZ], int len, const void *key)
+{
+	int i;
+	uint64_t start_tsc, end_tsc;
+	uint32_t bulk_hash[BATCH_SZ] = { 0 };
+
+	start_tsc = rte_rdtsc_precise();
+	for (i = 0; i < ITERATIONS; i++)
+		rte_thash_gfni_bulk((const uint64_t *)key, len,
+			(uint8_t **)tuples, bulk_hash, BATCH_SZ);
+
+	end_tsc = rte_rdtsc_precise();
+
+	return end_tsc - start_tsc;
+}
+
+static void
+run_thash_test(unsigned int tuple_len)
+{
+	uint32_t *tuples[BATCH_SZ];
+	unsigned int i, j;
+	uint32_t len = RTE_ALIGN_CEIL(tuple_len, sizeof(uint32_t));
+	uint64_t tsc_diff;
+
+	for (i = 0; i < BATCH_SZ; i++) {
+		tuples[i] = rte_zmalloc(NULL, len, 0);
+		for (j = 0; j < len / sizeof(uint32_t); j++)
+			tuples[i][j] = rte_rand();
+	}
+
+	tsc_diff = run_rss_calc(tuples, TEST_SOFTRSS, len, default_rss_key);
+	printf("Average rte_softrss() takes \t\t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+
+	tsc_diff = run_rss_calc(tuples, TEST_SOFTRSS_BE, len,
+		default_rss_key);
+	printf("Average rte_softrss_be() takes \t\t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+
+	if (!rte_thash_gfni_supported())
+		return;
+
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	tsc_diff = run_rss_calc(tuples, TEST_RSS_GFNI, len, rss_key_matrixes);
+	printf("Average rte_thash_gfni takes \t\t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+
+	tsc_diff = run_rss_calc_bulk(tuples, len, rss_key_matrixes);
+	printf("Average rte_thash_gfni_bulk takes \t%.1f cycles for key len %d\n",
+		(double)(tsc_diff) / (double)(ITERATIONS * BATCH_SZ), len);
+}
+
+static int
+test_thash_perf(void)
+{
+	run_thash_test(IPV4_2_TUPLE_LEN);
+	run_thash_test(IPV4_4_TUPLE_LEN);
+	run_thash_test(IPV6_2_TUPLE_LEN);
+	run_thash_test(IPV6_4_TUPLE_LEN);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(thash_perf_autotest, test_thash_perf);
-- 
2.7.4


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation
  2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
@ 2021-11-04 10:20   ` Thomas Monjalon
  0 siblings, 0 replies; 72+ messages in thread
From: Thomas Monjalon @ 2021-11-04 10:20 UTC (permalink / raw)
  To: Vladimir Medvedkin
  Cc: dev, yipeng1.wang, sameh.gobriel, bruce.richardson,
	konstantin.ananyev, stephen

02/11/2021 19:38, Vladimir Medvedkin:
> This patch series adds a new optimized implementation for the Toeplitz hash
> function using Galois Field New Instructions (GFNI).
> The main use case of this function is to calculate the hash value for a single
> data, so there is no bulk implementation.
> For performance reasons, the implementation was placed in a public header.
> It is the responsibility of the user to ensure the platform supports GFNI
> (by checking rte_thash_gfni_supported() at runtime) before calling
> these functions.
> 
> v8:
> - rebased on the latest main
> - fixed buffer overflow reported by ASAN
> 
> v7:
> - reworked performance tests code
> 
> v6:
> - addressed Thomas and Stephen's comments
> - squashed the doc and test commits with the relevant code addition
> - split into 2 patches, one per implementation - single and bulk
> 
> v5:
> - rebase on the latest main
> - fix spelling
> 
> v4:
> - included rte_log.h inside the rte_thash_gfni.h
> 
> v3:
> - implementation moved to x86 specific header
> - added rte_thash_gfni_supported() instead of the variable
> - removed RTE_INIT section, due to adding rte_thash_gfni_supported()
> - reworked rte_thash_complete_matrix() to make it easier to read
> 
> v2:
> - fixed typos
> - made big_rss_key static const and indented
> - addressed Konstantin's comments
> 
> Vladimir Medvedkin (4):
>   hash: add new toeplitz hash implementation
>   hash: add bulk toeplitz hash implementation
>   hash: enable gfni thash implementation
>   test/thash: add performance tests for the Toeplitz hash

Applied, thanks.




^ permalink raw reply	[flat|nested] 72+ messages in thread

end of thread, other threads:[~2021-11-04 10:20 UTC | newest]

Thread overview: 72+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-09-06 16:03 ` [dpdk-dev] [PATCH 1/5] hash: add new toeplitz " Vladimir Medvedkin
2021-10-07 18:23   ` Ananyev, Konstantin
2021-10-08 11:19     ` Ananyev, Konstantin
2021-10-15  9:11     ` Medvedkin, Vladimir
2021-10-15 10:55       ` Ananyev, Konstantin
2021-10-15 13:09         ` Medvedkin, Vladimir
2021-09-06 16:03 ` [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-08 11:31   ` Ananyev, Konstantin
2021-10-15  9:13     ` Medvedkin, Vladimir
2021-09-06 16:03 ` [dpdk-dev] [PATCH 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-09-06 16:03 ` [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-09-07  0:35   ` Stephen Hemminger
2021-09-08 13:59     ` Medvedkin, Vladimir
2021-09-06 16:03 ` [dpdk-dev] [PATCH 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 " Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz " Vladimir Medvedkin
2021-10-25 17:05         ` Thomas Monjalon
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-25 17:04         ` Thomas Monjalon
2021-10-26 20:30           ` Medvedkin, Vladimir
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-25 17:02         ` Thomas Monjalon
2021-10-26 20:29           ` Medvedkin, Vladimir
2021-10-27  8:29             ` Thomas Monjalon
2021-10-27 15:48               ` Medvedkin, Vladimir
2021-10-25 17:27         ` Stephen Hemminger
2021-10-26 20:31           ` Medvedkin, Vladimir
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
2021-10-21  9:42     ` Ananyev, Konstantin
2021-10-21 17:17       ` Medvedkin, Vladimir
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-21  9:46     ` Ananyev, Konstantin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
2021-10-15 16:58   ` Stephen Hemminger
2021-10-18 10:40     ` Ananyev, Konstantin
2021-10-19  1:15       ` Stephen Hemminger
2021-10-19 15:42         ` Medvedkin, Vladimir
2021-10-18 11:08     ` Medvedkin, Vladimir
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 1/4] hash: add new toeplitz " Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 2/4] hash: add bulk " Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 1/4] hash: add new toeplitz " Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 2/4] hash: add bulk " Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-11-04 10:20   ` Thomas Monjalon
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 1/4] hash: add new toeplitz " Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 2/4] hash: add bulk " Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).