DPDK patches and discussions
* [PATCH 1/3] ip_frag: optimize key compare and hash generation
@ 2023-05-23 12:54 pbhagavatula
  2023-05-23 12:54 ` [PATCH 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-23 12:54 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd, Ruifeng Wang, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the optimized rte_hash_k32_cmp_eq() routine for key comparison on
x86 and ARM64.
Use CRC instructions for hash generation on ARM64.
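
For context, the hunk below simply widens the existing x86-only CRC path
to ARM64. A minimal standalone sketch of the first-signature computation
in ipv4_frag_hash(), assuming DPDK's rte_hash_crc.h plus the existing
PRIME_VALUE and struct ip_frag_key definitions (the helper name is
illustrative, not part of the patch):

	#include <rte_hash_crc.h>

	static inline uint32_t
	ipv4_frag_key_sig(const struct ip_frag_key *key)
	{
		const uint32_t *p = (const uint32_t *)&key->src_dst;
		uint32_t v;

		/* rte_hash_crc_4byte() compiles to a single CRC32
		 * instruction on x86 (SSE4.2) and ARM64 (ARMv8 CRC). */
		v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
		v = rte_hash_crc_4byte(p[1], v);
		v = rte_hash_crc_4byte(key->id, v);
		return v;
	}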

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
 lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
 lib/ip_frag/ip_frag_common.h   | 17 ++++++++++-------
 lib/ip_frag/ip_frag_internal.c |  4 ++--
 4 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
index e9e26f9abd..a3e85635eb 100644
--- a/lib/hash/rte_cmp_arm64.h
+++ b/lib/hash/rte_cmp_arm64.h
@@ -3,7 +3,7 @@
  */
 
 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 		    size_t key_len __rte_unused)
 {
@@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 	return !(x0 == 0 && x1 == 0);
 }
 
-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }
 
-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }
 
-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }
 
-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }
 
-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }
 
-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }
 
-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
index 13a5836351..ddfbef462f 100644
--- a/lib/hash/rte_cmp_x86.h
+++ b/lib/hash/rte_cmp_x86.h
@@ -5,7 +5,7 @@
 #include <rte_vect.h>
 
 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
 {
 	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
@@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
 	return !_mm_test_all_zeros(x, x);
 }
 
-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }
 
-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }
 
-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }
 
-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }
 
-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }
 
-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }
 
-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
index 0d8ce6a1e1..816dc5b985 100644
--- a/lib/ip_frag/ip_frag_common.h
+++ b/lib/ip_frag/ip_frag_common.h
@@ -5,7 +5,13 @@
 #ifndef _IP_FRAG_COMMON_H_
 #define _IP_FRAG_COMMON_H_
 
-#include <sys/queue.h>
+#include <rte_common.h>
+
+#ifdef RTE_ARCH_ARM64
+#include <rte_cmp_arm64.h>
+#else
+#include <rte_cmp_x86.h>
+#endif
 
 #include "rte_ip_frag.h"
 #include "ip_reassembly.h"
@@ -75,12 +81,9 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
 static inline uint64_t
 ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
 {
-	uint32_t i;
-	uint64_t val;
-	val = k1->id_key_len ^ k2->id_key_len;
-	for (i = 0; i < k1->key_len; i++)
-		val |= k1->src_dst[i] ^ k2->src_dst[i];
-	return val;
+	return (k1->id_key_len != k2->id_key_len) ||
+	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
+					     rte_hash_k32_cmp_eq(k1, k2, 32));
 }
 
 /*
diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index b436a4c931..7cbef647df 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)
 
 	p = (const uint32_t *)&key->src_dst;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(key->id, v);
@@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)
 
 	p = (const uint32_t *) &key->src_dst;
 
-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(p[2], v);
-- 
2.25.1



* [PATCH 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 12:54 [PATCH 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
@ 2023-05-23 12:54 ` pbhagavatula
  2023-05-23 12:54 ` [PATCH 3/3] test: add reassembly perf test pbhagavatula
  2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
  2 siblings, 0 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-23 12:54 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd, Konstantin Ananyev; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve reassembly lookup performance by using NEON intrinsics for
key validation.
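
The key trick in the diff below is emulating an x86-style movemask on
ARM64 so that four bucket entries can be tested per iteration: a
full-width 32-bit compare result is narrowed with vshrn, packing each
lane into 16 bits of a 64-bit scalar, after which bit 15 of every
16-bit group serves as that lane's mask bit. A minimal sketch of just
that helper (the function name is illustrative, not part of the patch):

	#include <arm_neon.h>
	#include <stdint.h>

	/* cmp holds all-ones/all-zeros 32-bit lanes, e.g. from vceqzq_u32().
	 * Lane i lands in bits [16*i, 16*i + 15] of the result, so masking
	 * with 0x8000800080008000 and using __builtin_ctzll(mask) >> 4
	 * recovers the first set lane index, as the lookup below does. */
	static inline uint64_t
	neon_movemask16(uint32x4_t cmp)
	{
		return vget_lane_u64(
			vreinterpret_u64_u16(vshrn_n_u32(cmp, 16)), 0);
	}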

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
 lib/ip_frag/ip_reassembly.h      |   6 +
 lib/ip_frag/rte_ip_frag_common.c |  10 ++
 3 files changed, 196 insertions(+), 44 deletions(-)

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 7cbef647df..de78a0ed8f 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -4,8 +4,9 @@

 #include <stddef.h>

-#include <rte_jhash.h>
 #include <rte_hash_crc.h>
+#include <rte_jhash.h>
+#include <rte_vect.h>

 #include "ip_frag_common.h"

@@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	return pkt;
 }

-struct ip_frag_pkt *
-ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
-	const struct ip_frag_key *key, uint64_t tms,
-	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+static inline void
+ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
+	    uint32_t list_idx, uint32_t list_cnt)
+{
+	RTE_SET_USED(tbl);
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(list_cnt);
+	if (p->key.key_len == IPV4_KEYLEN)
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    p->key.src_dst[0], p->key.id, p->start);
+	else
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <" IPv6_KEY_BYTES_FMT
+			    ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    IPv6_KEY_BYTES(p->key.src_dst), p->key.id,
+			    p->start);
+}
+
+#if defined(RTE_ARCH_ARM64)
+static inline struct ip_frag_pkt *
+ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	struct ip_frag_pkt *empty, *old;
+	struct ip_frag_pkt *p1, *p2;
+	uint32_t assoc, sig1, sig2;
+	uint64_t max_cycles;
+
+	empty = NULL;
+	old = NULL;
+
+	max_cycles = tbl->max_cycles;
+	assoc = tbl->bucket_entries;
+
+	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
+		return tbl->last;
+
+	/* different hashing methods for IPv4 and IPv6 */
+	if (key->key_len == IPV4_KEYLEN)
+		ipv4_frag_hash(key, &sig1, &sig2);
+	else
+		ipv6_frag_hash(key, &sig1, &sig2);
+
+	p1 = IP_FRAG_TBL_POS(tbl, sig1);
+	p2 = IP_FRAG_TBL_POS(tbl, sig2);
+
+	uint64x2_t key0, key1, key2, key3;
+	uint64_t vmask, zmask, ts_mask;
+	uint64x2_t ts0, ts1;
+	uint32x4_t nz_key;
+	uint8_t idx;
+	/* Bucket entries are always power of 2. */
+	rte_prefetch0(&p1[0].key);
+	rte_prefetch0(&p1[1].key);
+	rte_prefetch0(&p2[0].key);
+	rte_prefetch0(&p2[1].key);
+
+	while (assoc > 1) {
+		if (assoc > 2) {
+			rte_prefetch0(&p1[2].key);
+			rte_prefetch0(&p1[3].key);
+			rte_prefetch0(&p2[2].key);
+			rte_prefetch0(&p2[3].key);
+		}
+		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
+		key0 = vld1q_u64(&p[0]->key.id_key_len);
+		key1 = vld1q_u64(&p[1]->key.id_key_len);
+		key2 = vld1q_u64(&p[2]->key.id_key_len);
+		key3 = vld1q_u64(&p[3]->key.id_key_len);
+
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
+
+		nz_key = vceqzq_u32(nz_key);
+		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
+		vmask = ~zmask;
+
+		vmask &= 0x8000800080008000;
+		for (; vmask > 0; vmask &= vmask - 1) {
+			idx = __builtin_ctzll(vmask) >> 4;
+			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
+				return p[idx];
+		}
+
+		vmask = ~zmask;
+		if (zmask && empty == NULL) {
+			zmask &= 0x8000800080008000;
+			idx = __builtin_ctzll(zmask) >> 4;
+			empty = p[idx];
+		}
+
+		if (vmask && old == NULL) {
+			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
+			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
+
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
+
+			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
+			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
+
+			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
+							vuzp1q_u32(vreinterpretq_u32_u64(ts0),
+								   vreinterpretq_u32_u64(ts1)),
+							16)),
+						0);
+			vmask &= 0x8000800080008000;
+			ts_mask &= vmask;
+			if (ts_mask) {
+				idx = __builtin_ctzll(ts_mask) >> 4;
+				old = p[idx];
+			}
+		}
+		p1 += 2;
+		p2 += 2;
+		assoc -= 4;
+	}
+	while (assoc) {
+		if (ip_frag_key_cmp(key, &p1->key) == 0)
+			return p1;
+		else if (ip_frag_key_is_empty(&p1->key))
+			empty = (empty == NULL) ? p1 : empty;
+		else if (max_cycles + p1->start < tms)
+			old = (old == NULL) ? p1 : old;
+
+		if (ip_frag_key_cmp(key, &p2->key) == 0)
+			return p2;
+		else if (ip_frag_key_is_empty(&p2->key))
+			empty = (empty == NULL) ? p2 : empty;
+		else if (max_cycles + p2->start < tms)
+			old = (old == NULL) ? p2 : old;
+		p1++;
+		p2++;
+		assoc--;
+	}
+
+	*free = empty;
+	*stale = old;
+	return NULL;
+}
+#endif
+
+static struct ip_frag_pkt *
+ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
 {
 	struct ip_frag_pkt *p1, *p2;
 	struct ip_frag_pkt *empty, *old;
@@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	p2 = IP_FRAG_TBL_POS(tbl, sig2);

 	for (i = 0; i != assoc; i++) {
-		if (p1->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
-
+		ip_frag_dbg(tbl, &p1[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
 			return p1 + i;
 		else if (ip_frag_key_is_empty(&p1[i].key))
@@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 		else if (max_cycles + p1[i].start < tms)
 			old = (old == NULL) ? (p1 + i) : old;

-		if (p2->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
-
+		ip_frag_dbg(tbl, &p2[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
 			return p2 + i;
 		else if (ip_frag_key_is_empty(&p2[i].key))
-			empty = (empty == NULL) ?( p2 + i) : empty;
+			empty = (empty == NULL) ? (p2 + i) : empty;
 		else if (max_cycles + p2[i].start < tms)
 			old = (old == NULL) ? (p2 + i) : old;
 	}
@@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	*stale = old;
 	return NULL;
 }
+
+struct ip_frag_pkt *
+ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	switch (tbl->lookup_fn) {
+#if defined(RTE_ARCH_ARM64)
+	case REASSEMBLY_LOOKUP_NEON:
+		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
+#endif
+	case REASSEMBLY_LOOKUP_SCALAR:
+	default:
+		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
+	}
+}
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index ef9d8c0d75..049437ae32 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -12,6 +12,11 @@

 #include <rte_ip_frag.h>

+enum ip_frag_lookup_func {
+	REASSEMBLY_LOOKUP_SCALAR = 0,
+	REASSEMBLY_LOOKUP_NEON,
+};
+
 enum {
 	IP_LAST_FRAG_IDX,    /* index of last fragment */
 	IP_FIRST_FRAG_IDX,   /* index of first fragment */
@@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
 	struct ip_frag_pkt *last;     /* last used entry. */
 	struct ip_pkt_list lru;       /* LRU list for table entries. */
 	struct ip_frag_tbl_stat stat; /* statistics counters. */
+	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function. */
 	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
 };

diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
index c1de2e81b6..ef3c104e45 100644
--- a/lib/ip_frag/rte_ip_frag_common.c
+++ b/lib/ip_frag/rte_ip_frag_common.c
@@ -5,7 +5,9 @@
 #include <stddef.h>
 #include <stdio.h>

+#include <rte_cpuflags.h>
 #include <rte_log.h>
+#include <rte_vect.h>

 #include "ip_frag_common.h"

@@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->bucket_entries = bucket_entries;
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);

+#if defined(RTE_ARCH_ARM64)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
+	else
+#endif
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
+
 	TAILQ_INIT(&(tbl->lru));
 	return tbl;
 }
--
2.39.1



* [PATCH 3/3] test: add reassembly perf test
  2023-05-23 12:54 [PATCH 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-05-23 12:54 ` [PATCH 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
@ 2023-05-23 12:54 ` pbhagavatula
  2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
  2 siblings, 0 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-23 12:54 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add a reassembly perf autotest covering both IPv4 and IPv6 reassembly.
Each test is run with a variable number of fragments per flow, with
either ordered or unordered fragments, and with interleaved flows.
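
The test registers as reassembly_perf_autotest; assuming a default
meson build directory, it can be run from the dpdk-test prompt:

	$ ./build/app/test/dpdk-test
	RTE>>reassembly_perf_autotest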

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v4 Changes:
 - Rebase to master.
 v3 Changes:
 - Fix checkpatch issues.
 v2 Changes:
 - Rebase to master, reduce memory consumption, set default mempool ops
 to ring_mp_mc.

 app/test/meson.build            |    2 +
 app/test/test_reassembly_perf.c | 1001 +++++++++++++++++++++++++++++++
 2 files changed, 1003 insertions(+)
 create mode 100644 app/test/test_reassembly_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index b9b5432496..8cc4f03db8 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -108,6 +108,7 @@ test_sources = files(
         'test_rawdev.c',
         'test_rcu_qsbr.c',
         'test_rcu_qsbr_perf.c',
+        'test_reassembly_perf.c',
         'test_reciprocal_division.c',
         'test_reciprocal_division_perf.c',
         'test_red.c',
@@ -297,6 +298,7 @@ perf_test_names = [
         'trace_perf_autotest',
         'ipsec_perf_autotest',
         'thash_perf_autotest',
+        'reassembly_perf_autotest',
 ]

 driver_test_names = [
diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
new file mode 100644
index 0000000000..850485a9c5
--- /dev/null
+++ b/app/test/test_reassembly_perf.c
@@ -0,0 +1,1001 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell.
+ */
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_hexdump.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_random.h>
+#include <rte_udp.h>
+
+#include "test.h"
+
+#define MAX_FLOWS	    (1024 * 32)
+#define MAX_BKTS	    MAX_FLOWS
+#define MAX_ENTRIES_PER_BKT 16
+#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define MIN_FRAGMENTS	    2
+#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
+
+#define MAX_PKT_LEN 2048
+#define MAX_TTL_MS  (5 * MS_PER_S)
+
+/* use RFC863 Discard Protocol */
+#define UDP_SRC_PORT 9
+#define UDP_DST_PORT 9
+
+/* use RFC5735 / RFC2544 reserved network test addresses */
+#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
+#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 8) | (x))
+
+/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
+static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#define IP6_VERSION 6
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+
+static struct rte_ip_frag_tbl *frag_tbl;
+static struct rte_mempool *pkt_pool;
+static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
+static uint8_t frag_per_flow[MAX_FLOWS];
+static uint32_t flow_cnt;
+
+#define FILL_MODE_LINEAR      0
+#define FILL_MODE_RANDOM      1
+#define FILL_MODE_INTERLEAVED 2
+
+static int
+reassembly_test_setup(void)
+{
+	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
+
+	frag_tbl = rte_ip_frag_table_create(MAX_FLOWS, MAX_ENTRIES_PER_BKT,
+					    MAX_FLOWS * MAX_ENTRIES_PER_BKT,
+					    max_ttl_cyc, rte_socket_id());
+	if (frag_tbl == NULL)
+		return TEST_FAILED;
+
+	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
+	pkt_pool = rte_pktmbuf_pool_create(
+		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS, 0, 0,
+		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+	if (pkt_pool == NULL) {
+		printf("[%s] Failed to create pkt pool\n", __func__);
+		rte_ip_frag_table_destroy(frag_tbl);
+		return TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static void
+reassembly_test_teardown(void)
+{
+	if (frag_tbl != NULL)
+		rte_ip_frag_table_destroy(frag_tbl);
+
+	if (pkt_pool != NULL)
+		rte_mempool_free(pkt_pool);
+}
+
+static void
+randomize_array_positions(void **array, uint8_t sz)
+{
+	void *tmp;
+	int i, j;
+
+	if (sz == 2) {
+		tmp = array[0];
+		array[0] = array[1];
+		array[1] = tmp;
+	} else {
+		for (i = sz - 1; i > 0; i--) {
+			j = rte_rand_max(i + 1);
+			tmp = array[i];
+			array[i] = array[j];
+			array[j] = tmp;
+		}
+	}
+}
+
+static void
+reassembly_print_banner(const char *proto_str)
+{
+	printf("+=============================================================="
+	       "============================================+\n");
+	printf("| %-32s| %-3s : %-58d|\n", proto_str, "Flow Count", MAX_FLOWS);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+	printf("%-17s%-17s%-14s%-14s%-25s%-20s\n", "| Fragment Order",
+	       "| Fragments/Flow", "| Outstanding", "| Cycles/Flow",
+	       "| Cycles/Fragment insert", "| Cycles/Reassembly |");
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint32_t ip_cksum;
+		uint16_t pkt_len;
+		uint16_t *ptr16;
+
+		frag_offset = i * (frag_len / 8);
+
+		if (i == nb_frags - 1)
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+		else
+			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv4_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
+		ip_hdr->time_to_live = IP_DEFTTL;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id =
+			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
+		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
+		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
+		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));
+
+		/*
+		 * Compute IP header checksum.
+		 */
+		ptr16 = (unaligned_uint16_t *)ip_hdr;
+		ip_cksum = 0;
+		ip_cksum += ptr16[0];
+		ip_cksum += ptr16[1];
+		ip_cksum += ptr16[2];
+		ip_cksum += ptr16[3];
+		ip_cksum += ptr16[4];
+		ip_cksum += ptr16[6];
+		ip_cksum += ptr16[7];
+		ip_cksum += ptr16[8];
+		ip_cksum += ptr16[9];
+
+		/*
+		 * Reduce 32 bit checksum to 16 bits and complement it.
+		 */
+		ip_cksum = ((ip_cksum & 0xFFFF0000) >> 16) +
+			   (ip_cksum & 0x0000FFFF);
+		if (ip_cksum > 65535)
+			ip_cksum -= 65535;
+		ip_cksum = (~ip_cksum) & 0x0000FFFF;
+		if (ip_cksum == 0)
+			ip_cksum = 0xFFFF;
+		ip_hdr->hdr_checksum = (uint16_t)ip_cksum;
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len = sizeof(struct rte_ipv4_hdr);
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static uint8_t
+get_rand_frags(uint8_t max_frag)
+{
+	uint8_t frags = rte_rand_max(max_frag + 1);
+
+	return frags <= 1 ? MIN_FRAGMENTS : frags;
+}
+
+static int
+ipv4_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+ipv6_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct ipv6_extension_fragment *frag_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint16_t pkt_len;
+
+		frag_offset = i * (frag_len / 8);
+		frag_offset <<= 3;
+		if (i == nb_frags - 1) {
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 0);
+		} else {
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 1);
+		}
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv6_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr) +
+				RTE_IPV6_FRAG_HDR_SIZE);
+		frag_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct ipv6_extension_fragment *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv6_hdr) +
+				     RTE_IPV6_FRAG_HDR_SIZE);
+		ip_hdr->vtc_flow = rte_cpu_to_be_32(IP6_VERSION << 28);
+		ip_hdr->payload_len =
+			rte_cpu_to_be_16(pkt_len - sizeof(struct rte_ipv6_hdr));
+		ip_hdr->proto = IPPROTO_FRAGMENT;
+		ip_hdr->hop_limits = IP_DEFTTL;
+		memcpy(ip_hdr->src_addr, ip6_addr, sizeof(ip_hdr->src_addr));
+		memcpy(ip_hdr->dst_addr, ip6_addr, sizeof(ip_hdr->dst_addr));
+		ip_hdr->src_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->src_addr[7] |= 0x10;
+		ip_hdr->src_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->src_addr[9] = flow_id & 0xff;
+
+		ip_hdr->dst_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->dst_addr[7] |= 0x20;
+		ip_hdr->dst_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->dst_addr[9] = flow_id & 0xff;
+
+		frag_hdr->next_header = IPPROTO_UDP;
+		frag_hdr->reserved = 0;
+		frag_hdr->frag_data = rte_cpu_to_be_16(frag_offset);
+		frag_hdr->id = rte_cpu_to_be_32(flow_id + 1);
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len =
+			sizeof(struct rte_ipv6_hdr) + RTE_IPV6_FRAG_HDR_SIZE;
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static int
+ipv6_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+frag_pkt_teardown(void)
+{
+	uint32_t i;
+
+	for (i = 0; i < flow_cnt; i++)
+		rte_pktmbuf_free(mbufs[i][0]);
+}
+
+static void
+reassembly_print_stats(int8_t nb_frags, uint8_t fill_order,
+		       uint32_t outstanding, uint64_t cyc_per_flow,
+		       uint64_t cyc_per_frag_insert,
+		       uint64_t cyc_per_reassembly)
+{
+	char frag_str[8], order_str[12];
+
+	if (nb_frags > 0)
+		snprintf(frag_str, sizeof(frag_str), "%d", nb_frags);
+	else
+		snprintf(frag_str, sizeof(frag_str), "RANDOM");
+
+	switch (fill_order) {
+	case FILL_MODE_LINEAR:
+		snprintf(order_str, sizeof(order_str), "LINEAR");
+		break;
+	case FILL_MODE_RANDOM:
+		snprintf(order_str, sizeof(order_str), "RANDOM");
+		break;
+	case FILL_MODE_INTERLEAVED:
+		snprintf(order_str, sizeof(order_str), "INTERLEAVED");
+		break;
+	default:
+		break;
+	}
+
+	printf("| %-14s | %-14s | %-11d | %-11" PRIu64 " | %-22" PRIu64
+	       " | %-17" PRIu64 " |\n",
+	       order_str, frag_str, outstanding, cyc_per_flow,
+	       cyc_per_frag_insert, cyc_per_reassembly);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+join_array(struct rte_mbuf **dest_arr, struct rte_mbuf **src_arr,
+	   uint8_t offset, uint8_t sz)
+{
+	int i, j;
+
+	for (i = offset, j = 0; j < sz; i++, j++)
+		dest_arr[i] = src_arr[j];
+}
+
+static int
+ipv4_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv4_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv4_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv4_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv4_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv4_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+ipv6_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv6_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv6_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv6_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv6_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv6_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+test_reassembly_perf(void)
+{
+	int8_t nb_fragments[] = {2, 3, MAX_FRAGMENTS, -1 /* Random */};
+	uint8_t order_type[] = {FILL_MODE_LINEAR, FILL_MODE_RANDOM};
+	uint32_t outstanding[] = {100, 500, 1000, 2000, 3000};
+	uint32_t i, j;
+	int rc;
+
+	rc = reassembly_test_setup();
+	if (rc)
+		return rc;
+
+	reassembly_print_banner("IPV4");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv4_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv4_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	printf("\n");
+	reassembly_print_banner("IPV6");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv6_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv6_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	reassembly_test_teardown();
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(reassembly_perf_autotest, test_reassembly_perf);
--
2.25.1



* [PATCH v2 1/3] ip_frag: optimize key compare and hash generation
  2023-05-23 12:54 [PATCH 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-05-23 12:54 ` [PATCH 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
  2023-05-23 12:54 ` [PATCH 3/3] test: add reassembly perf test pbhagavatula
@ 2023-05-23 14:39 ` pbhagavatula
  2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
                     ` (2 more replies)
  2 siblings, 3 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-23 14:39 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd, Ruifeng Wang, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the optimized rte_hash_k32_cmp_eq() routine for key comparison on
x86 and ARM64.
Use CRC instructions for hash generation on ARM64.
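
One note for readers of the ip_frag_key_cmp() hunk below: despite the
"_eq" suffix, rte_hash_k32_cmp_eq() follows memcmp()-style semantics and
returns 0 only when the two 32-byte keys match, which is what lets it
slot straight into ip_frag_key_cmp()'s zero-means-equal contract. A
small sketch (helper name illustrative; the first 32 bytes of struct
ip_frag_key are src_dst):

	/* Returns 0 iff the src_dst halves of the keys are identical. */
	static inline int
	frag_key_src_dst_differ(const struct ip_frag_key *k1,
				const struct ip_frag_key *k2)
	{
		return rte_hash_k32_cmp_eq(k1, k2, 32);
		/* scalar equivalent: return memcmp(k1, k2, 32) != 0; */
	}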

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation failure on non-ARM64/x86 targets

 lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
 lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
 lib/ip_frag/ip_frag_common.h   | 14 +++++++++++++-
 lib/ip_frag/ip_frag_internal.c |  4 ++--
 4 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
index e9e26f9abd..a3e85635eb 100644
--- a/lib/hash/rte_cmp_arm64.h
+++ b/lib/hash/rte_cmp_arm64.h
@@ -3,7 +3,7 @@
  */

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 		    size_t key_len __rte_unused)
 {
@@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 	return !(x0 == 0 && x1 == 0);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
index 13a5836351..ddfbef462f 100644
--- a/lib/hash/rte_cmp_x86.h
+++ b/lib/hash/rte_cmp_x86.h
@@ -5,7 +5,7 @@
 #include <rte_vect.h>

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
 {
 	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
@@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
 	return !_mm_test_all_zeros(x, x);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
index 0d8ce6a1e1..5cdd98c8fe 100644
--- a/lib/ip_frag/ip_frag_common.h
+++ b/lib/ip_frag/ip_frag_common.h
@@ -5,7 +5,13 @@
 #ifndef _IP_FRAG_COMMON_H_
 #define _IP_FRAG_COMMON_H_

-#include <sys/queue.h>
+#include <rte_common.h>
+
+#if defined(RTE_ARCH_ARM64)
+#include <rte_cmp_arm64.h>
+#elif defined(RTE_ARCH_X86)
+#include <rte_cmp_x86.h>
+#endif

 #include "rte_ip_frag.h"
 #include "ip_reassembly.h"
@@ -75,12 +81,18 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
 static inline uint64_t
 ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
 {
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
+	return (k1->id_key_len != k2->id_key_len) ||
+	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
+					     rte_hash_k32_cmp_eq(k1, k2, 32));
+#else
 	uint32_t i;
 	uint64_t val;
 	val = k1->id_key_len ^ k2->id_key_len;
 	for (i = 0; i < k1->key_len; i++)
 		val |= k1->src_dst[i] ^ k2->src_dst[i];
 	return val;
+#endif
 }

 /*
diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index b436a4c931..7cbef647df 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *)&key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(key->id, v);
@@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *) &key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(p[2], v);
--
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
@ 2023-05-23 14:39   ` pbhagavatula
  2023-05-23 16:22     ` Honnappa Nagarahalli
  2023-05-23 22:30     ` Stephen Hemminger
  2023-05-23 14:39   ` [PATCH v2 3/3] test: add reassembly perf test pbhagavatula
  2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  2 siblings, 2 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-23 14:39 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd, Konstantin Ananyev; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve reassembly lookup performance by using NEON intrinsics for
key validation.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
 lib/ip_frag/ip_reassembly.h      |   6 +
 lib/ip_frag/rte_ip_frag_common.c |  10 ++
 3 files changed, 196 insertions(+), 44 deletions(-)
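
A note on the approach: the NEON path scans four table entries per
iteration and collapses 128-bit lane-wise compare results into a 64-bit
scan mask. A minimal sketch of that narrowing trick (illustrative only,
not part of the patch; the helper name is hypothetical):

	#include <arm_neon.h>
	#include <stdint.h>

	/* Collapse a lane-wise 32-bit compare result into a 64-bit mask:
	 * vshrn_n_u32() narrows each 0xFFFFFFFF/0x00000000 lane to 16 bits,
	 * so lane i lands in bits [16*i, 16*i + 15] of the result.
	 */
	static inline uint64_t
	lanes_zero_mask(uint32x4_t v)
	{
		uint32x4_t eqz = vceqzq_u32(v); /* all-ones per zero lane */

		return vget_lane_u64(
			vreinterpret_u64_u16(vshrn_n_u32(eqz, 16)), 0);
	}

Masking the result with 0x8000800080008000 keeps one bit per lane, so
__builtin_ctzll(mask) >> 4 recovers the entry index; the lookup below
uses this pattern to pick out matching, empty and stale entries.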

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 7cbef647df..de78a0ed8f 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -4,8 +4,9 @@
 
 #include <stddef.h>
 
-#include <rte_jhash.h>
 #include <rte_hash_crc.h>
+#include <rte_jhash.h>
+#include <rte_vect.h>
 
 #include "ip_frag_common.h"
 
@@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	return pkt;
 }
 
-struct ip_frag_pkt *
-ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
-	const struct ip_frag_key *key, uint64_t tms,
-	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+static inline void
+ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
+	    uint32_t list_idx, uint32_t list_cnt)
+{
+	RTE_SET_USED(tbl);
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(list_cnt);
+	if (p->key.key_len == IPV4_KEYLEN)
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    p->key.src_dst[0], p->key.id, p->start);
+	else
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <" IPv6_KEY_BYTES_FMT
+			    ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    IPv6_KEY_BYTES(p->key.src_dst), p->key.id,
+			    p->start);
+}
+
+#if defined(RTE_ARCH_ARM64)
+static inline struct ip_frag_pkt *
+ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	struct ip_frag_pkt *empty, *old;
+	struct ip_frag_pkt *p1, *p2;
+	uint32_t assoc, sig1, sig2;
+	uint64_t max_cycles;
+
+	empty = NULL;
+	old = NULL;
+
+	max_cycles = tbl->max_cycles;
+	assoc = tbl->bucket_entries;
+
+	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
+		return tbl->last;
+
+	/* different hashing methods for IPv4 and IPv6 */
+	if (key->key_len == IPV4_KEYLEN)
+		ipv4_frag_hash(key, &sig1, &sig2);
+	else
+		ipv6_frag_hash(key, &sig1, &sig2);
+
+	p1 = IP_FRAG_TBL_POS(tbl, sig1);
+	p2 = IP_FRAG_TBL_POS(tbl, sig2);
+
+	uint64x2_t key0, key1, key2, key3;
+	uint64_t vmask, zmask, ts_mask;
+	uint64x2_t ts0, ts1;
+	uint32x4_t nz_key;
+	uint8_t idx;
+	/* Bucket entries are always power of 2. */
+	rte_prefetch0(&p1[0].key);
+	rte_prefetch0(&p1[1].key);
+	rte_prefetch0(&p2[0].key);
+	rte_prefetch0(&p2[1].key);
+
+	while (assoc > 1) {
+		if (assoc > 2) {
+			rte_prefetch0(&p1[2].key);
+			rte_prefetch0(&p1[3].key);
+			rte_prefetch0(&p2[2].key);
+			rte_prefetch0(&p2[3].key);
+		}
+		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
+		key0 = vld1q_u64(&p[0]->key.id_key_len);
+		key1 = vld1q_u64(&p[1]->key.id_key_len);
+		key2 = vld1q_u64(&p[2]->key.id_key_len);
+		key3 = vld1q_u64(&p[3]->key.id_key_len);
+
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
+
+		nz_key = vceqzq_u32(nz_key);
+		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
+		vmask = ~zmask;
+
+		vmask &= 0x8000800080008000;
+		for (; vmask > 0; vmask &= vmask - 1) {
+			idx = __builtin_ctzll(vmask) >> 4;
+			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
+				return p[idx];
+		}
+
+		vmask = ~zmask;
+		if (zmask && empty == NULL) {
+			zmask &= 0x8000800080008000;
+			idx = __builtin_ctzll(zmask) >> 4;
+			empty = p[idx];
+		}
+
+		if (vmask && old == NULL) {
+			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
+			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
+
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
+
+			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
+			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
+
+			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
+							vuzp1q_u32(vreinterpretq_u32_u64(ts0),
+								   vreinterpretq_u32_u64(ts1)),
+							16)),
+						0);
+			vmask &= 0x8000800080008000;
+			ts_mask &= vmask;
+			if (ts_mask) {
+				idx = __builtin_ctzll(ts_mask) >> 4;
+				old = p[idx];
+			}
+		}
+		p1 += 2;
+		p2 += 2;
+		assoc -= 4;
+	}
+	while (assoc) {
+		if (ip_frag_key_cmp(key, &p1->key) == 0)
+			return p1;
+		else if (ip_frag_key_is_empty(&p1->key))
+			empty = (empty == NULL) ? p1 : empty;
+		else if (max_cycles + p1->start < tms)
+			old = (old == NULL) ? p1 : old;
+
+		if (ip_frag_key_cmp(key, &p2->key) == 0)
+			return p2;
+		else if (ip_frag_key_is_empty(&p2->key))
+			empty = (empty == NULL) ? p2 : empty;
+		else if (max_cycles + p2->start < tms)
+			old = (old == NULL) ? p2 : old;
+		p1++;
+		p2++;
+		assoc--;
+	}
+
+	*free = empty;
+	*stale = old;
+	return NULL;
+}
+#endif
+
+static struct ip_frag_pkt *
+ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
 {
 	struct ip_frag_pkt *p1, *p2;
 	struct ip_frag_pkt *empty, *old;
@@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	p2 = IP_FRAG_TBL_POS(tbl, sig2);
 
 	for (i = 0; i != assoc; i++) {
-		if (p1->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
-
+		ip_frag_dbg(tbl, &p1[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
 			return p1 + i;
 		else if (ip_frag_key_is_empty(&p1[i].key))
@@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 		else if (max_cycles + p1[i].start < tms)
 			old = (old == NULL) ? (p1 + i) : old;
 
-		if (p2->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
-
+		ip_frag_dbg(tbl, &p2[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
 			return p2 + i;
 		else if (ip_frag_key_is_empty(&p2[i].key))
-			empty = (empty == NULL) ?( p2 + i) : empty;
+			empty = (empty == NULL) ? (p2 + i) : empty;
 		else if (max_cycles + p2[i].start < tms)
 			old = (old == NULL) ? (p2 + i) : old;
 	}
@@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	*stale = old;
 	return NULL;
 }
+
+struct ip_frag_pkt *
+ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	switch (tbl->lookup_fn) {
+#if defined(RTE_ARCH_ARM64)
+	case REASSEMBLY_LOOKUP_NEON:
+		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
+#endif
+	case REASSEMBLY_LOOKUP_SCALAR:
+	default:
+		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
+	}
+}
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index ef9d8c0d75..049437ae32 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -12,6 +12,11 @@
 
 #include <rte_ip_frag.h>
 
+enum ip_frag_lookup_func {
+	REASSEMBLY_LOOKUP_SCALAR = 0,
+	REASSEMBLY_LOOKUP_NEON,
+};
+
 enum {
 	IP_LAST_FRAG_IDX,    /* index of last fragment */
 	IP_FIRST_FRAG_IDX,   /* index of first fragment */
@@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
 	struct ip_frag_pkt *last;     /* last used entry. */
 	struct ip_pkt_list lru;       /* LRU list for table entries. */
 	struct ip_frag_tbl_stat stat; /* statistics counters. */
+	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function. */
 	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
 };
 
diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
index c1de2e81b6..ef3c104e45 100644
--- a/lib/ip_frag/rte_ip_frag_common.c
+++ b/lib/ip_frag/rte_ip_frag_common.c
@@ -5,7 +5,9 @@
 #include <stddef.h>
 #include <stdio.h>
 
+#include <rte_cpuflags.h>
 #include <rte_log.h>
+#include <rte_vect.h>
 
 #include "ip_frag_common.h"
 
@@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->bucket_entries = bucket_entries;
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
 
+#if defined(RTE_ARCH_ARM64)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
+	else
+#endif
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
+
 	TAILQ_INIT(&(tbl->lru));
 	return tbl;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2 3/3] test: add reassembly perf test
  2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
@ 2023-05-23 14:39   ` pbhagavatula
  2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  2 siblings, 0 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-23 14:39 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add a reassembly perf autotest for both IPv4 and IPv6.
Each test runs with a variable number of fragments per flow, with
fragments arriving in order or out of order, and with interleaved flows.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 app/test/meson.build            |    2 +
 app/test/test_reassembly_perf.c | 1001 +++++++++++++++++++++++++++++++
 2 files changed, 1003 insertions(+)
 create mode 100644 app/test/test_reassembly_perf.c
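
The test registers as "reassembly_perf_autotest"; assuming a default
meson build directory (the path may differ on your setup), it can be
invoked as:

	$ DPDK_TEST=reassembly_perf_autotest ./build/app/test/dpdk-test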

diff --git a/app/test/meson.build b/app/test/meson.build
index b9b5432496..8cc4f03db8 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -108,6 +108,7 @@ test_sources = files(
         'test_rawdev.c',
         'test_rcu_qsbr.c',
         'test_rcu_qsbr_perf.c',
+        'test_reassembly_perf.c',
         'test_reciprocal_division.c',
         'test_reciprocal_division_perf.c',
         'test_red.c',
@@ -297,6 +298,7 @@ perf_test_names = [
         'trace_perf_autotest',
         'ipsec_perf_autotest',
         'thash_perf_autotest',
+        'reassembly_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
new file mode 100644
index 0000000000..850485a9c5
--- /dev/null
+++ b/app/test/test_reassembly_perf.c
@@ -0,0 +1,1001 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell.
+ */
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_hexdump.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_random.h>
+#include <rte_udp.h>
+
+#include "test.h"
+
+#define MAX_FLOWS	    (1024 * 32)
+#define MAX_BKTS	    MAX_FLOWS
+#define MAX_ENTRIES_PER_BKT 16
+#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define MIN_FRAGMENTS	    2
+#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
+
+#define MAX_PKT_LEN 2048
+#define MAX_TTL_MS  (5 * MS_PER_S)
+
+/* use RFC863 Discard Protocol */
+#define UDP_SRC_PORT 9
+#define UDP_DST_PORT 9
+
+/* use RFC5735 / RFC2544 reserved network test addresses */
+#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
+#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 8) | (x))
+
+/* 2001:0200::/48 is the IANA-reserved range for IPv6 benchmarking (RFC5180) */
+static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#define IP6_VERSION 6
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+
+static struct rte_ip_frag_tbl *frag_tbl;
+static struct rte_mempool *pkt_pool;
+static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
+static uint8_t frag_per_flow[MAX_FLOWS];
+static uint32_t flow_cnt;
+
+#define FILL_MODE_LINEAR      0
+#define FILL_MODE_RANDOM      1
+#define FILL_MODE_INTERLEAVED 2
+
+static int
+reassembly_test_setup(void)
+{
+	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
+
+	frag_tbl = rte_ip_frag_table_create(MAX_FLOWS, MAX_ENTRIES_PER_BKT,
+					    MAX_FLOWS * MAX_ENTRIES_PER_BKT,
+					    max_ttl_cyc, rte_socket_id());
+	if (frag_tbl == NULL)
+		return TEST_FAILED;
+
+	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
+	pkt_pool = rte_pktmbuf_pool_create(
+		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS, 0, 0,
+		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+	if (pkt_pool == NULL) {
+		printf("[%s] Failed to create pkt pool\n", __func__);
+		rte_ip_frag_table_destroy(frag_tbl);
+		return TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static void
+reassembly_test_teardown(void)
+{
+	if (frag_tbl != NULL)
+		rte_ip_frag_table_destroy(frag_tbl);
+
+	if (pkt_pool != NULL)
+		rte_mempool_free(pkt_pool);
+}
+
+static void
+randomize_array_positions(void **array, uint8_t sz)
+{
+	void *tmp;
+	int i, j;
+
+	if (sz == 2) {
+		tmp = array[0];
+		array[0] = array[1];
+		array[1] = tmp;
+	} else {
+		for (i = sz - 1; i > 0; i--) {
+			j = rte_rand_max(i + 1);
+			tmp = array[i];
+			array[i] = array[j];
+			array[j] = tmp;
+		}
+	}
+}
+
+static void
+reassembly_print_banner(const char *proto_str)
+{
+	printf("+=============================================================="
+	       "============================================+\n");
+	printf("| %-32s| %-3s : %-58d|\n", proto_str, "Flow Count", MAX_FLOWS);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+	printf("%-17s%-17s%-14s%-14s%-25s%-20s\n", "| Fragment Order",
+	       "| Fragments/Flow", "| Outstanding", "| Cycles/Flow",
+	       "| Cycles/Fragment insert", "| Cycles/Reassembly |");
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint32_t ip_cksum;
+		uint16_t pkt_len;
+		uint16_t *ptr16;
+
+		frag_offset = i * (frag_len / 8);
+
+		if (i == nb_frags - 1)
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+		else
+			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv4_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
+		ip_hdr->time_to_live = IP_DEFTTL;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id =
+			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
+		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
+		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
+		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));
+
+		/*
+		 * Compute IP header checksum.
+		 */
+		ptr16 = (unaligned_uint16_t *)ip_hdr;
+		ip_cksum = 0;
+		ip_cksum += ptr16[0];
+		ip_cksum += ptr16[1];
+		ip_cksum += ptr16[2];
+		ip_cksum += ptr16[3];
+		ip_cksum += ptr16[4];
+		ip_cksum += ptr16[6];
+		ip_cksum += ptr16[7];
+		ip_cksum += ptr16[8];
+		ip_cksum += ptr16[9];
+
+		/*
+		 * Reduce 32 bit checksum to 16 bits and complement it.
+		 */
+		ip_cksum = ((ip_cksum & 0xFFFF0000) >> 16) +
+			   (ip_cksum & 0x0000FFFF);
+		if (ip_cksum > 65535)
+			ip_cksum -= 65535;
+		ip_cksum = (~ip_cksum) & 0x0000FFFF;
+		if (ip_cksum == 0)
+			ip_cksum = 0xFFFF;
+		ip_hdr->hdr_checksum = (uint16_t)ip_cksum;
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len = sizeof(struct rte_ipv4_hdr);
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static uint8_t
+get_rand_frags(uint8_t max_frag)
+{
+	uint8_t frags = rte_rand_max(max_frag + 1);
+
+	return frags <= 1 ? MIN_FRAGMENTS : frags;
+}
+
+static int
+ipv4_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+ipv6_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct ipv6_extension_fragment *frag_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint16_t pkt_len;
+
+		frag_offset = i * (frag_len / 8);
+		frag_offset <<= 3;
+		if (i == nb_frags - 1) {
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 0);
+		} else {
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 1);
+		}
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv6_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr) +
+				RTE_IPV6_FRAG_HDR_SIZE);
+		frag_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct ipv6_extension_fragment *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv6_hdr) +
+				     RTE_IPV6_FRAG_HDR_SIZE);
+		ip_hdr->vtc_flow = rte_cpu_to_be_32(IP6_VERSION << 28);
+		ip_hdr->payload_len =
+			rte_cpu_to_be_16(pkt_len - sizeof(struct rte_ipv6_hdr));
+		ip_hdr->proto = IPPROTO_FRAGMENT;
+		ip_hdr->hop_limits = IP_DEFTTL;
+		memcpy(ip_hdr->src_addr, ip6_addr, sizeof(ip_hdr->src_addr));
+		memcpy(ip_hdr->dst_addr, ip6_addr, sizeof(ip_hdr->dst_addr));
+		ip_hdr->src_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->src_addr[7] |= 0x10;
+		ip_hdr->src_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->src_addr[9] = flow_id & 0xff;
+
+		ip_hdr->dst_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->dst_addr[7] |= 0x20;
+		ip_hdr->dst_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->dst_addr[9] = flow_id & 0xff;
+
+		frag_hdr->next_header = IPPROTO_UDP;
+		frag_hdr->reserved = 0;
+		frag_hdr->frag_data = rte_cpu_to_be_16(frag_offset);
+		frag_hdr->id = rte_cpu_to_be_32(flow_id + 1);
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len =
+			sizeof(struct rte_ipv6_hdr) + RTE_IPV6_FRAG_HDR_SIZE;
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static int
+ipv6_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+frag_pkt_teardown(void)
+{
+	uint32_t i;
+
+	for (i = 0; i < flow_cnt; i++)
+		rte_pktmbuf_free(mbufs[i][0]);
+}
+
+static void
+reassembly_print_stats(int8_t nb_frags, uint8_t fill_order,
+		       uint32_t outstanding, uint64_t cyc_per_flow,
+		       uint64_t cyc_per_frag_insert,
+		       uint64_t cyc_per_reassembly)
+{
+	char frag_str[8], order_str[12];
+
+	if (nb_frags > 0)
+		snprintf(frag_str, sizeof(frag_str), "%d", nb_frags);
+	else
+		snprintf(frag_str, sizeof(frag_str), "RANDOM");
+
+	switch (fill_order) {
+	case FILL_MODE_LINEAR:
+		snprintf(order_str, sizeof(order_str), "LINEAR");
+		break;
+	case FILL_MODE_RANDOM:
+		snprintf(order_str, sizeof(order_str), "RANDOM");
+		break;
+	case FILL_MODE_INTERLEAVED:
+		snprintf(order_str, sizeof(order_str), "INTERLEAVED");
+		break;
+	default:
+		break;
+	}
+
+	printf("| %-14s | %-14s | %-11d | %-11" PRIu64 " | %-22" PRIu64
+	       " | %-17" PRIu64 " |\n",
+	       order_str, frag_str, outstanding, cyc_per_flow,
+	       cyc_per_frag_insert, cyc_per_reassembly);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+join_array(struct rte_mbuf **dest_arr, struct rte_mbuf **src_arr,
+	   uint8_t offset, uint8_t sz)
+{
+	int i, j;
+
+	for (i = offset, j = 0; j < sz; i++, j++)
+		dest_arr[i] = src_arr[j];
+}
+
+static int
+ipv4_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv4_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv4_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv4_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv4_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv4_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+ipv6_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv6_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv6_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv6_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv6_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv6_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+test_reassembly_perf(void)
+{
+	int8_t nb_fragments[] = {2, 3, MAX_FRAGMENTS, -1 /* Random */};
+	uint8_t order_type[] = {FILL_MODE_LINEAR, FILL_MODE_RANDOM};
+	uint32_t outstanding[] = {100, 500, 1000, 2000, 3000};
+	uint32_t i, j;
+	int rc;
+
+	rc = reassembly_test_setup();
+	if (rc)
+		return rc;
+
+	reassembly_print_banner("IPV4");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv4_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv4_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	printf("\n");
+	reassembly_print_banner("IPV6");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv6_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv6_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	reassembly_test_teardown();
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(reassembly_perf_autotest, test_reassembly_perf);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
@ 2023-05-23 16:22     ` Honnappa Nagarahalli
  2023-05-23 17:58       ` Pavan Nikhilesh Bhagavatula
  2023-05-23 22:30     ` Stephen Hemminger
  1 sibling, 1 reply; 28+ messages in thread
From: Honnappa Nagarahalli @ 2023-05-23 16:22 UTC (permalink / raw)
  To: pbhagavatula, jerinj, nd, Konstantin Ananyev; +Cc: dev, nd, nd



> -----Original Message-----
> From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> Sent: Tuesday, May 23, 2023 9:39 AM
> To: jerinj@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin Ananyev
> <konstantin.v.ananyev@yandex.ru>
> Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
> 
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Improve reassembly lookup performance by using NEON intrinsics for key
> validation.
What improvement do you see with this?

> [snipped: remainder of quoted patch, identical to the v2 2/3 diff above]


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 16:22     ` Honnappa Nagarahalli
@ 2023-05-23 17:58       ` Pavan Nikhilesh Bhagavatula
  2023-05-23 22:23         ` Pavan Nikhilesh Bhagavatula
  0 siblings, 1 reply; 28+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-05-23 17:58 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Jerin Jacob Kollanukkaran, nd, Konstantin Ananyev
  Cc: dev, nd, nd

> > -----Original Message-----
> > From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> > Sent: Tuesday, May 23, 2023 9:39 AM
> > To: jerinj@marvell.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin
> Ananyev
> > <konstantin.v.ananyev@yandex.ru>
> > Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> > Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
> >
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> > Improve reassembly lookup performance by using NEON intrinsics for key
> > validation.
> What improvement do you see with this?

On Neoverse-N2 I see an improvement of around 300-600 cycles per flow and ~200 cycles per insert.

Here are some test results.

Without patch:
+==========================================================================================================+
| IPV4                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 1244        | 919                    | 114               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1653        | 968                    | 128               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1379        | 503                    | 110               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 1613        | 520                    | 139               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 2030        | 199                    | 190               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 4393        | 309                    | 402               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 1531        | 333                    | 147               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 2771        | 357                    | 213               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 1228        | 920                    | 102               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 1197        | 905                    | 103               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 1183        | 904                    | 104               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 1153        | 921                    | 105               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 1123        | 911                    | 111               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 829         | 193                    | 690               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 830         | 195                    | 682               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 817         | 211                    | 690               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 819         | 195                    | 690               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 823         | 223                    | 676               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 1765        | 1038                   | 177               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2588        | 699                    | 190               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 5253        | 265                    | 403               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 3398        | 493                    | 301               |
+================+================+=============+=============+========================+===================+

+==========================================================================================================+
| IPV6                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 1838        | 1176                   | 136               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1892        | 1188                   | 160               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1986        | 628                    | 143               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 2670        | 646                    | 155               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 3152        | 261                    | 271               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 5127        | 324                    | 434               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 2169        | 427                    | 203               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 3382        | 452                    | 255               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 1837        | 1164                   | 124               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 1790        | 1158                   | 126               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 1807        | 1161                   | 138               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 1776        | 1160                   | 138               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 1715        | 1169                   | 144               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 1488        | 256                    | 1228              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 1461        | 300                    | 1205              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 1457        | 303                    | 1202              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 1456        | 305                    | 1201              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 1460        | 308                    | 1205              |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 2145        | 1330                   | 296               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2778        | 830                    | 330               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 5715        | 324                    | 444               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 3625        | 550                    | 363               |
+================+================+=============+=============+========================+===================+

With patch:

+==========================================================================================================+
| IPV4                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 950         | 717                    | 98                |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1013        | 706                    | 108               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1096        | 397                    | 115               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 1150        | 412                    | 128               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 1783        | 166                    | 202               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 3933        | 284                    | 424               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 1288        | 267                    | 159               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 2393        | 302                    | 235               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 956         | 703                    | 110               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 937         | 693                    | 112               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 912         | 670                    | 121               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 908         | 688                    | 122               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 894         | 688                    | 128               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 1019        | 179                    | 865               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 1052        | 176                    | 895               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 1130        | 180                    | 1003              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 1143        | 180                    | 1020              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 1130        | 181                    | 985               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 1582        | 710                    | 168               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2162        | 446                    | 194               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 4997        | 214                    | 426               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 2921        | 341                    | 311               |
+================+================+=============+=============+========================+===================+

+==========================================================================================================+
| IPV6                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 1275        | 687                    | 125               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1335        | 721                    | 169               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1388        | 415                    | 169               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 2117        | 393                    | 163               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 2811        | 172                    | 241               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 4322        | 227                    | 401               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 1730        | 270                    | 192               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 2839        | 317                    | 264               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 1152        | 662                    | 126               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 1107        | 658                    | 130               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 1190        | 647                    | 138               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 1086        | 635                    | 141               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 1064        | 645                    | 150               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 1560        | 172                    | 1296              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 1536        | 226                    | 1274              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 1543        | 228                    | 1282              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 1548        | 228                    | 1287              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 1541        | 227                    | 1280              |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 1585        | 769                    | 281               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2222        | 536                    | 327               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 4962        | 232                    | 439               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 2998        | 373                    | 360               |
+================+================+=============+=============+========================+===================+
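
Cycle counts like the ones above can be collected along these lines; this is a
simplified sketch, not the actual test application, and tbl, dr and frags are
assumed to be initialized elsewhere (nb_frags > 0):

#include <rte_cycles.h>
#include <rte_ip_frag.h>
#include <rte_mbuf.h>

/* Average cycles per IPv4 fragment insert across a burst. */
static uint64_t
avg_insert_cycles(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
		  struct rte_mbuf **frags, uint32_t nb_frags)
{
	uint64_t total = 0;
	uint32_t i;

	for (i = 0; i < nb_frags; i++) {
		struct rte_ipv4_hdr *hdr = rte_pktmbuf_mtod_offset(
			frags[i], struct rte_ipv4_hdr *, frags[i]->l2_len);
		uint64_t tms = rte_rdtsc();
		/* Non-NULL return means this flow finished reassembly. */
		struct rte_mbuf *out =
			rte_ipv4_frag_reassemble_packet(tbl, dr, frags[i], tms, hdr);

		total += rte_rdtsc() - tms;
		if (out != NULL)
			rte_pktmbuf_free(out);
	}
	return total / nb_frags;
}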

> 
> >
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
> >  lib/ip_frag/ip_reassembly.h      |   6 +
> >  lib/ip_frag/rte_ip_frag_common.c |  10 ++
> >  3 files changed, 196 insertions(+), 44 deletions(-)
> >
> > diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
> > index 7cbef647df..de78a0ed8f 100644
> > --- a/lib/ip_frag/ip_frag_internal.c
> > +++ b/lib/ip_frag/ip_frag_internal.c
> > @@ -4,8 +4,9 @@
> >
> >  #include <stddef.h>
> >
> > -#include <rte_jhash.h>
> >  #include <rte_hash_crc.h>
> > +#include <rte_jhash.h>
> > +#include <rte_vect.h>
> >
> >  #include "ip_frag_common.h"
> >
> > @@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
> >  	return pkt;
> >  }
> >
> > -struct ip_frag_pkt *
> > -ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > -	const struct ip_frag_key *key, uint64_t tms,
> > -	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > +static inline void
> > +ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
> > +	    uint32_t list_idx, uint32_t list_cnt)
> > +{
> > +	RTE_SET_USED(tbl);
> > +	RTE_SET_USED(list_idx);
> > +	RTE_SET_USED(list_cnt);
> > +	if (p->key.key_len == IPV4_KEYLEN)
> > +		IP_FRAG_LOG(DEBUG,
> > +			    "%s:%d:\n"
> > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > +			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
> > +			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > +			    tbl->use_entries, p, list_idx, list_cnt,
> > +			    p->key.src_dst[0], p->key.id, p->start);
> > +	else
> > +		IP_FRAG_LOG(DEBUG,
> > +			    "%s:%d:\n"
> > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > +			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
> > +			    "key: <" IPv6_KEY_BYTES_FMT
> > +			    ", %#x>, start: %" PRIu64 "\n",
> > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > +			    tbl->use_entries, p, list_idx, list_cnt,
> > +			    IPv6_KEY_BYTES(p->key.src_dst), p->key.id,
> > +			    p->start);
> > +}
> > +
> > +#if defined(RTE_ARCH_ARM64)
> > +static inline struct ip_frag_pkt *
> > +ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
> > +		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > +{
> > +	struct ip_frag_pkt *empty, *old;
> > +	struct ip_frag_pkt *p1, *p2;
> > +	uint32_t assoc, sig1, sig2;
> > +	uint64_t max_cycles;
> > +
> > +	empty = NULL;
> > +	old = NULL;
> > +
> > +	max_cycles = tbl->max_cycles;
> > +	assoc = tbl->bucket_entries;
> > +
> > +	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
> > +		return tbl->last;
> > +
> > +	/* different hashing methods for IPv4 and IPv6 */
> > +	if (key->key_len == IPV4_KEYLEN)
> > +		ipv4_frag_hash(key, &sig1, &sig2);
> > +	else
> > +		ipv6_frag_hash(key, &sig1, &sig2);
> > +
> > +	p1 = IP_FRAG_TBL_POS(tbl, sig1);
> > +	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> > +
> > +	uint64x2_t key0, key1, key2, key3;
> > +	uint64_t vmask, zmask, ts_mask;
> > +	uint64x2_t ts0, ts1;
> > +	uint32x4_t nz_key;
> > +	uint8_t idx;
> > +	/* Bucket entries are always power of 2. */
> > +	rte_prefetch0(&p1[0].key);
> > +	rte_prefetch0(&p1[1].key);
> > +	rte_prefetch0(&p2[0].key);
> > +	rte_prefetch0(&p2[1].key);
> > +
> > +	while (assoc > 1) {
> > +		if (assoc > 2) {
> > +			rte_prefetch0(&p1[2].key);
> > +			rte_prefetch0(&p1[3].key);
> > +			rte_prefetch0(&p2[2].key);
> > +			rte_prefetch0(&p2[3].key);
> > +		}
> > +		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
> > +		key0 = vld1q_u64(&p[0]->key.id_key_len);
> > +		key1 = vld1q_u64(&p[1]->key.id_key_len);
> > +		key2 = vld1q_u64(&p[2]->key.id_key_len);
> > +		key3 = vld1q_u64(&p[3]->key.id_key_len);
> > +
> > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
> > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
> > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
> > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
> > +
> > +		nz_key = vceqzq_u32(nz_key);
> > +		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
> > +		vmask = ~zmask;
> > +
> > +		vmask &= 0x8000800080008000;
> > +		for (; vmask > 0; vmask &= vmask - 1) {
> > +			idx = __builtin_ctzll(vmask) >> 4;
> > +			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
> > +				return p[idx];
> > +		}
> > +
> > +		vmask = ~zmask;
> > +		if (zmask && empty == NULL) {
> > +			zmask &= 0x8000800080008000;
> > +			idx = __builtin_ctzll(zmask) >> 4;
> > +			empty = p[idx];
> > +		}
> > +
> > +		if (vmask && old == NULL) {
> > +			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
> > +			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
> > +
> > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
> > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
> > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
> > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
> > +
> > +			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
> > +			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
> > +
> > +			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
> > +						vuzp1q_u32(vreinterpretq_u32_u64(ts0),
> > +							   vreinterpretq_u32_u64(ts1)),
> > +						16)), 0);
> > +			vmask &= 0x8000800080008000;
> > +			ts_mask &= vmask;
> > +			if (ts_mask) {
> > +				idx = __builtin_ctzll(ts_mask) >> 4;
> > +				old = p[idx];
> > +			}
> > +		}
> > +		p1 += 2;
> > +		p2 += 2;
> > +		assoc -= 4;
> > +	}
> > +	while (assoc) {
> > +		if (ip_frag_key_cmp(key, &p1->key) == 0)
> > +			return p1;
> > +		else if (ip_frag_key_is_empty(&p1->key))
> > +			empty = (empty == NULL) ? p1 : empty;
> > +		else if (max_cycles + p1->start < tms)
> > +			old = (old == NULL) ? p1 : old;
> > +
> > +		if (ip_frag_key_cmp(key, &p2->key) == 0)
> > +			return p2;
> > +		else if (ip_frag_key_is_empty(&p2->key))
> > +			empty = (empty == NULL) ? p2 : empty;
> > +		else if (max_cycles + p2->start < tms)
> > +			old = (old == NULL) ? p2 : old;
> > +		p1++;
> > +		p2++;
> > +		assoc--;
> > +	}
> > +
> > +	*free = empty;
> > +	*stale = old;
> > +	return NULL;
> > +}
> > +#endif
> > +
> > +static struct ip_frag_pkt *
> > +ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
> > +		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> >  {
> >  	struct ip_frag_pkt *p1, *p2;
> >  	struct ip_frag_pkt *empty, *old;
> > @@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> >  	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> >
> >  	for (i = 0; i != assoc; i++) {
> > -		if (p1->key.key_len == IPV4_KEYLEN)
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > -					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
> > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl->use_entries,
> > -					p1, i, assoc,
> > -			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
> > -		else
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > -					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
> > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl->use_entries,
> > -					p1, i, assoc,
> > -			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
> > -
> > +		ip_frag_dbg(tbl, &p1[i], i, assoc);
> >  		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
> >  			return p1 + i;
> >  		else if (ip_frag_key_is_empty(&p1[i].key))
> > @@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> >  		else if (max_cycles + p1[i].start < tms)
> >  			old = (old == NULL) ? (p1 + i) : old;
> >
> > -		if (p2->key.key_len == IPV4_KEYLEN)
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > -					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
> > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl->use_entries,
> > -					p2, i, assoc,
> > -			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
> > -		else
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > -					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
> > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl->use_entries,
> > -					p2, i, assoc,
> > -			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
> > -
> > +		ip_frag_dbg(tbl, &p2[i], i, assoc);
> >  		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
> >  			return p2 + i;
> >  		else if (ip_frag_key_is_empty(&p2[i].key))
> > -			empty = (empty == NULL) ?( p2 + i) : empty;
> > +			empty = (empty == NULL) ? (p2 + i) : empty;
> >  		else if (max_cycles + p2[i].start < tms)
> >  			old = (old == NULL) ? (p2 + i) : old;
> >  	}
> > @@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> >  	*stale = old;
> >  	return NULL;
> >  }
> > +
> > +struct ip_frag_pkt *
> > +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
> > +	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > +{
> > +	switch (tbl->lookup_fn) {
> > +#if defined(RTE_ARCH_ARM64)
> > +	case REASSEMBLY_LOOKUP_NEON:
> > +		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
> > +#endif
> > +	case REASSEMBLY_LOOKUP_SCALAR:
> > +	default:
> > +		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
> > +	}
> > +}
> > diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
> > index ef9d8c0d75..049437ae32 100644
> > --- a/lib/ip_frag/ip_reassembly.h
> > +++ b/lib/ip_frag/ip_reassembly.h
> > @@ -12,6 +12,11 @@
> >
> >  #include <rte_ip_frag.h>
> >
> > +enum ip_frag_lookup_func {
> > +	REASSEMBLY_LOOKUP_SCALAR = 0,
> > +	REASSEMBLY_LOOKUP_NEON,
> > +};
> > +
> >  enum {
> >  	IP_LAST_FRAG_IDX,    /* index of last fragment */
> >  	IP_FIRST_FRAG_IDX,   /* index of first fragment */
> > @@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
> >  	struct ip_frag_pkt *last;     /* last used entry. */
> >  	struct ip_pkt_list lru;       /* LRU list for table entries. */
> >  	struct ip_frag_tbl_stat stat; /* statistics counters. */
> > +	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function. */
> >  	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
> >  };
> >
> > diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
> > index c1de2e81b6..ef3c104e45 100644
> > --- a/lib/ip_frag/rte_ip_frag_common.c
> > +++ b/lib/ip_frag/rte_ip_frag_common.c
> > @@ -5,7 +5,9 @@
> >  #include <stddef.h>
> >  #include <stdio.h>
> >
> > +#include <rte_cpuflags.h>
> >  #include <rte_log.h>
> > +#include <rte_vect.h>
> >
> >  #include "ip_frag_common.h"
> >
> > @@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
> >  	tbl->bucket_entries = bucket_entries;
> >  	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
> >
> > +#if defined(RTE_ARCH_ARM64)
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
> > +	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
> > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
> > +	else
> > +#endif
> > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
> > +
> >  	TAILQ_INIT(&(tbl->lru));
> >  	return tbl;
> >  }
> > --
> > 2.25.1
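
For completeness, the lookup method is selected once at table creation; a
minimal usage sketch (sizing and timeout values below are illustrative):

#include <rte_cycles.h>
#include <rte_ip_frag.h>
#include <rte_lcore.h>

static struct rte_ip_frag_tbl *
create_reassembly_table(void)
{
	/* 4K buckets of 8 entries, 100 ms fragment timeout. On ARM64 with
	 * NEON and a max SIMD bitwidth >= 128 the table uses the vector
	 * lookup path; otherwise it falls back to the scalar one. */
	uint64_t frag_cycles = (rte_get_tsc_hz() / MS_PER_S) * 100;

	return rte_ip_frag_table_create(4096, 8, 4096 * 8, frag_cycles,
					rte_socket_id());
}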


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 17:58       ` Pavan Nikhilesh Bhagavatula
@ 2023-05-23 22:23         ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 28+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-05-23 22:23 UTC (permalink / raw)
  To: Honnappa Nagarahalli, Jerin Jacob Kollanukkaran, nd, Konstantin Ananyev
  Cc: dev, nd, nd



> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula
> Sent: Tuesday, May 23, 2023 11:29 PM
> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; nd <nd@arm.com>; Konstantin
> Ananyev <konstantin.v.ananyev@yandex.ru>
> Cc: dev@dpdk.org; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 2/3] ip_frag: improve reassembly lookup
> performance
> 
> > > -----Original Message-----
> > > From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> > > Sent: Tuesday, May 23, 2023 9:39 AM
> > > To: jerinj@marvell.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin
> > Ananyev
> > > <konstantin.v.ananyev@yandex.ru>
> > > Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup
> performance
> > >
> > > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > >
> > > Improve reassembly lookup performance by using NEON intrinsics for key
> > > validation.
> > What improvement do you see with this?
> 
> On Neoverse-N2 I see an improvement of around 300-600 cycles per flow and
> ~200 cycles per insert.
> 

The data below is incorrect due to a bug (see below), but I still see an improvement with IPv6.

> Here are some test results.
> 
> [snip benchmark tables quoted from the previous message]
> 
> >
> > >
> > > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > ---
> > >  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
> > >  lib/ip_frag/ip_reassembly.h      |   6 +
> > >  lib/ip_frag/rte_ip_frag_common.c |  10 ++
> > >  3 files changed, 196 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
> > > index 7cbef647df..de78a0ed8f 100644
> > > --- a/lib/ip_frag/ip_frag_internal.c
> > > +++ b/lib/ip_frag/ip_frag_internal.c
> > > @@ -4,8 +4,9 @@
> > >
> > >  #include <stddef.h>
> > >
> > > -#include <rte_jhash.h>
> > >  #include <rte_hash_crc.h>
> > > +#include <rte_jhash.h>
> > > +#include <rte_vect.h>
> > >
> > >  #include "ip_frag_common.h"
> > >
> > > @@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
> > >  	return pkt;
> > >  }
> > >
> > > -struct ip_frag_pkt *
> > > -ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > > -	const struct ip_frag_key *key, uint64_t tms,
> > > -	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > > +static inline void
> > > +ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
> > > +	    uint32_t list_idx, uint32_t list_cnt)
> > > +{
> > > +	RTE_SET_USED(tbl);
> > > +	RTE_SET_USED(list_idx);
> > > +	RTE_SET_USED(list_cnt);
> > > +	if (p->key.key_len == IPV4_KEYLEN)
> > > +		IP_FRAG_LOG(DEBUG,
> > > +			    "%s:%d:\n"
> > > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > > +			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
> > > +			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > > +			    tbl->use_entries, p, list_idx, list_cnt,
> > > +			    p->key.src_dst[0], p->key.id, p->start);
> > > +	else
> > > +		IP_FRAG_LOG(DEBUG,
> > > +			    "%s:%d:\n"
> > > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > > +			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
> > > +			    "key: <" IPv6_KEY_BYTES_FMT
> > > +			    ", %#x>, start: %" PRIu64 "\n",
> > > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > > +			    tbl->use_entries, p, list_idx, list_cnt,
> > > +			    IPv6_KEY_BYTES(p->key.src_dst), p->key.id,
> > > +			    p->start);
> > > +}
> > > +
> > > +#if defined(RTE_ARCH_ARM64)
> > > +static inline struct ip_frag_pkt *
> > > +ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
> > > +		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > > +{
> > > +	struct ip_frag_pkt *empty, *old;
> > > +	struct ip_frag_pkt *p1, *p2;
> > > +	uint32_t assoc, sig1, sig2;
> > > +	uint64_t max_cycles;
> > > +
> > > +	empty = NULL;
> > > +	old = NULL;
> > > +
> > > +	max_cycles = tbl->max_cycles;
> > > +	assoc = tbl->bucket_entries;
> > > +
> > > +	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
> > > +		return tbl->last;
> > > +
> > > +	/* different hashing methods for IPv4 and IPv6 */
> > > +	if (key->key_len == IPV4_KEYLEN)
> > > +		ipv4_frag_hash(key, &sig1, &sig2);
> > > +	else
> > > +		ipv6_frag_hash(key, &sig1, &sig2);
> > > +
> > > +	p1 = IP_FRAG_TBL_POS(tbl, sig1);
> > > +	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> > > +
> > > +	uint64x2_t key0, key1, key2, key3;
> > > +	uint64_t vmask, zmask, ts_mask;
> > > +	uint64x2_t ts0, ts1;
> > > +	uint32x4_t nz_key;
> > > +	uint8_t idx;
> > > +	/* Bucket entries are always power of 2. */
> > > +	rte_prefetch0(&p1[0].key);
> > > +	rte_prefetch0(&p1[1].key);
> > > +	rte_prefetch0(&p2[0].key);
> > > +	rte_prefetch0(&p2[1].key);
> > > +
> > > +	while (assoc > 1) {
> > > +		if (assoc > 2) {
> > > +			rte_prefetch0(&p1[2].key);
> > > +			rte_prefetch0(&p1[3].key);
> > > +			rte_prefetch0(&p2[2].key);
> > > +			rte_prefetch0(&p2[3].key);
> > > +		}
> > > +		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
> > > +		key0 = vld1q_u64(&p[0]->key.id_key_len);
> > > +		key1 = vld1q_u64(&p[1]->key.id_key_len);
> > > +		key2 = vld1q_u64(&p[2]->key.id_key_len);
> > > +		key3 = vld1q_u64(&p[3]->key.id_key_len);
> > > +
> > > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
> > > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
> > > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
> > > +		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
> > > +

I think we can compare the id part too since it's already in the vector register; I will rewrite this part.

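Roughly, the match test could then compare the whole id_key_len word per
candidate before falling back to the full key compare (untested sketch, not
the actual rewrite; it relies on lane 0 of each keyN vector holding the
candidate's id_key_len as loaded above):

	const uint64x2_t want = vdupq_n_u64(key->id_key_len);
	/* Gather lane 0 (id_key_len) of the four candidates. */
	uint64x2_t idkl01 = vuzp1q_u64(key0, key1);
	uint64x2_t idkl23 = vuzp1q_u64(key2, key3);
	uint64x2_t eq01 = vceqq_u64(idkl01, want);
	uint64x2_t eq23 = vceqq_u64(idkl23, want);

	/* Only candidates whose id and key_len both match need the
	 * full src_dst comparison.
	 */
	if (vgetq_lane_u64(eq01, 0) && ip_frag_key_cmp(key, &p[0]->key) == 0)
		return p[0];
	if (vgetq_lane_u64(eq01, 1) && ip_frag_key_cmp(key, &p[1]->key) == 0)
		return p[1];
	if (vgetq_lane_u64(eq23, 0) && ip_frag_key_cmp(key, &p[2]->key) == 0)
		return p[2];
	if (vgetq_lane_u64(eq23, 1) && ip_frag_key_cmp(key, &p[3]->key) == 0)
		return p[3];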
> > > +		nz_key = vceqzq_u32(nz_key);
> > > +		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
> > > +		vmask = ~zmask;
> > > +
> > > +		vmask &= 0x8000800080008000;
> > > +		for (; vmask > 0; vmask &= vmask - 1) {
> > > +			idx = __builtin_ctzll(vmask) >> 4;
> > > +			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
> > > +				return p[idx];
> > > +		}
> > > +
> > > +		vmask = ~zmask;
> > > +		if (zmask && empty == NULL) {
> > > +			zmask &= 0x8000800080008000;
> > > +			idx = __builtin_ctzll(zmask) >> 4;
> > > +			empty = p[idx];
> > > +		}
> > > +
> > > +		if (vmask && old == NULL) {
> > > +			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
> > > +			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
> > > +
> > > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
> > > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
> > > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
> > > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
> > > +
> > > +			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
> > > +			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
> > > +
> > > +			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
> > > +						vuzp1q_u32(vreinterpretq_u32_u64(ts0),
> > > +							   vreinterpretq_u32_u64(ts1)),
> > > +						16)), 0);
> > > +			vmask &= 0x8000800080008000;
> > > +			ts_mask &= vmask;
> > > +			if (ts_mask) {
> > > +				idx = __builtin_ctzll(ts_mask) >> 4;
> > > +				old = p[idx];
> > > +			}
> > > +		}
> > > +		p1 += 2;
> > > +		p2 += 2;
> > > +		assoc -= 4;

Should be assoc -= 2; each iteration of this loop consumes only two entries from each bucket.

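I.e. the loop tail would become:

		p1 += 2;
		p2 += 2;
		assoc -= 2;	/* two entries consumed per bucket per iteration */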
> > > +	}
> > > +	while (assoc) {
> > > +		if (ip_frag_key_cmp(key, &p1->key) == 0)
> > > +			return p1;
> > > +		else if (ip_frag_key_is_empty(&p1->key))
> > > +			empty = (empty == NULL) ? p1 : empty;
> > > +		else if (max_cycles + p1->start < tms)
> > > +			old = (old == NULL) ? p1 : old;
> > > +
> > > +		if (ip_frag_key_cmp(key, &p2->key) == 0)
> > > +			return p2;
> > > +		else if (ip_frag_key_is_empty(&p2->key))
> > > +			empty = (empty == NULL) ? p2 : empty;
> > > +		else if (max_cycles + p2->start < tms)
> > > +			old = (old == NULL) ? p2 : old;
> > > +		p1++;
> > > +		p2++;
> > > +		assoc--;
> > > +	}
> > > +
> > > +	*free = empty;
> > > +	*stale = old;
> > > +	return NULL;
> > > +}
> > > +#endif
> > > +
> > > +static struct ip_frag_pkt *
> > > +ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
> > > +		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > >  {
> > >  	struct ip_frag_pkt *p1, *p2;
> > >  	struct ip_frag_pkt *empty, *old;
> > > @@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > >  	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> > >
> > >  	for (i = 0; i != assoc; i++) {
> > > -		if (p1->key.key_len == IPV4_KEYLEN)
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > > -					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
> > > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl->use_entries,
> > > -					p1, i, assoc,
> > > -			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
> > > -		else
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > > -					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
> > > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl->use_entries,
> > > -					p1, i, assoc,
> > > -			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
> > > -
> > > +		ip_frag_dbg(tbl, &p1[i], i, assoc);
> > >  		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
> > >  			return p1 + i;
> > >  		else if (ip_frag_key_is_empty(&p1[i].key))
> > > @@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > >  		else if (max_cycles + p1[i].start < tms)
> > >  			old = (old == NULL) ? (p1 + i) : old;
> > >
> > > -		if (p2->key.key_len == IPV4_KEYLEN)
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > > -					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
> > > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl->use_entries,
> > > -					p2, i, assoc,
> > > -			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
> > > -		else
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u, use_entries: %u\n"
> > > -					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
> > > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl->use_entries,
> > > -					p2, i, assoc,
> > > -			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
> > > -
> > > +		ip_frag_dbg(tbl, &p2[i], i, assoc);
> > >  		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
> > >  			return p2 + i;
> > >  		else if (ip_frag_key_is_empty(&p2[i].key))
> > > -			empty = (empty == NULL) ?( p2 + i) : empty;
> > > +			empty = (empty == NULL) ? (p2 + i) : empty;
> > >  		else if (max_cycles + p2[i].start < tms)
> > >  			old = (old == NULL) ? (p2 + i) : old;
> > >  	}
> > > @@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > >  	*stale = old;
> > >  	return NULL;
> > >  }
> > > +
> > > +struct ip_frag_pkt *
> > > +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
> > > +	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > > +{
> > > +	switch (tbl->lookup_fn) {
> > > +#if defined(RTE_ARCH_ARM64)
> > > +	case REASSEMBLY_LOOKUP_NEON:
> > > +		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
> > > +#endif
> > > +	case REASSEMBLY_LOOKUP_SCALAR:
> > > +	default:
> > > +		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
> > > +	}
> > > +}
> > > diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
> > > index ef9d8c0d75..049437ae32 100644
> > > --- a/lib/ip_frag/ip_reassembly.h
> > > +++ b/lib/ip_frag/ip_reassembly.h
> > > @@ -12,6 +12,11 @@
> > >
> > >  #include <rte_ip_frag.h>
> > >
> > > +enum ip_frag_lookup_func {
> > > +	REASSEMBLY_LOOKUP_SCALAR = 0,
> > > +	REASSEMBLY_LOOKUP_NEON,
> > > +};
> > > +
> > >  enum {
> > >  	IP_LAST_FRAG_IDX,    /* index of last fragment */
> > >  	IP_FIRST_FRAG_IDX,   /* index of first fragment */
> > > @@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
> > >  	struct ip_frag_pkt *last;     /* last used entry. */
> > >  	struct ip_pkt_list lru;       /* LRU list for table entries. */
> > >  	struct ip_frag_tbl_stat stat; /* statistics counters. */
> > > +	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function. */
> > >  	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
> > >  };
> > >
> > > diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
> > > index c1de2e81b6..ef3c104e45 100644
> > > --- a/lib/ip_frag/rte_ip_frag_common.c
> > > +++ b/lib/ip_frag/rte_ip_frag_common.c
> > > @@ -5,7 +5,9 @@
> > >  #include <stddef.h>
> > >  #include <stdio.h>
> > >
> > > +#include <rte_cpuflags.h>
> > >  #include <rte_log.h>
> > > +#include <rte_vect.h>
> > >
> > >  #include "ip_frag_common.h"
> > >
> > > @@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
> > >  	tbl->bucket_entries = bucket_entries;
> > >  	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
> > >
> > > +#if defined(RTE_ARCH_ARM64)
> > > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
> > > +	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
> > > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
> > > +	else
> > > +#endif
> > > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
> > > +
> > >  	TAILQ_INIT(&(tbl->lru));
> > >  	return tbl;
> > >  }
> > > --
> > > 2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
  2023-05-23 16:22     ` Honnappa Nagarahalli
@ 2023-05-23 22:30     ` Stephen Hemminger
  2023-05-29 13:17       ` [EXT] " Pavan Nikhilesh Bhagavatula
  1 sibling, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2023-05-23 22:30 UTC (permalink / raw)
  To: pbhagavatula; +Cc: jerinj, Honnappa.Nagarahalli, nd, Konstantin Ananyev, dev

On Tue, 23 May 2023 20:09:20 +0530
<pbhagavatula@marvell.com> wrote:

> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Improve reassembly lookup performance by using NEON intrinsics for
> key validation.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
>  lib/ip_frag/ip_reassembly.h      |   6 +
>  lib/ip_frag/rte_ip_frag_common.c |  10 ++
>  3 files changed, 196 insertions(+), 44 deletions(-)


Using a function pointer for the lookup has some downsides.
On Intel, an indirect call is slower, especially with Spectre mitigations.

The bigger issue is that an indirect call will break usage from primary/secondary
processes with ASLR. If the primary sets up the table and the secondary uses it,
the function will be at a different address in each process.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXT] Re: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
  2023-05-23 22:30     ` Stephen Hemminger
@ 2023-05-29 13:17       ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 28+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-05-29 13:17 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jerin Jacob Kollanukkaran, Honnappa.Nagarahalli, nd,
	Konstantin Ananyev, dev


> On Tue, 23 May 2023 20:09:20 +0530
> <pbhagavatula@marvell.com> wrote:
> 
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> > Improve reassembly lookup performance by using NEON intrinsics for
> > key validation.
> >
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
> >  lib/ip_frag/ip_reassembly.h      |   6 +
> >  lib/ip_frag/rte_ip_frag_common.c |  10 ++
> >  3 files changed, 196 insertions(+), 44 deletions(-)
> 
> 
> Using a function pointer for the lookup has some downsides.
> On Intel, an indirect call is slower, especially with Spectre mitigations.
> 

The patch doesn't use function pointers directly; it stores a function ID and
switches on it. The function-ID scheme doesn't break the primary/secondary
process model, even with ASLR.
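
To illustrate the difference (rough sketch, not the exact patch code):

	/* A function pointer stored in the shared table breaks with ASLR:
	 * the secondary process maps the library at a different address
	 * than the primary that populated the table.
	 */
	struct ip_frag_pkt *(*lookup)(struct rte_ip_frag_tbl *tbl, ...);

	/* A function ID stored in the shared table is safe: each process
	 * resolves the ID to its own local function through a switch.
	 */
	enum ip_frag_lookup_func lookup_fn;

	switch (tbl->lookup_fn) {
#if defined(RTE_ARCH_ARM64)
	case REASSEMBLY_LOOKUP_NEON:
		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
#endif
	case REASSEMBLY_LOOKUP_SCALAR:
	default:
		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
	}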

> The bigger issue is that an indirect call will break usage from primary/secondary
> processes with ASLR. If the primary sets up the table and the secondary uses it,
> the function will be at a different address in each process.

I will be dropping this patch since the performance improvement with NEON is
negligible; there is a lot of bucket state that we unfortunately don't cache
with the current implementation.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3 1/2] ip_frag: optimize key compare and hash generation
  2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
  2023-05-23 14:39   ` [PATCH v2 3/3] test: add reassembly perf test pbhagavatula
@ 2023-05-29 14:55   ` pbhagavatula
  2023-05-29 14:55     ` [PATCH v3 2/2] test: add reassembly perf test pbhagavatula
                       ` (3 more replies)
  2 siblings, 4 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-29 14:55 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use optimized rte_hash_k32_cmp_eq routine for key comparison for
x86 and ARM64.
Use CRC instructions for hash generation on ARM64.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
On Neoverse-N2, performance improved by 10% when measured with
examples/ip_reassembly.

 v3 Changes:
 - Drop NEON patch.
 v2 Changes:
 - Fix compilation failure with non ARM64/x86 targets

 lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
 lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
 lib/ip_frag/ip_frag_common.h   | 14 +++++++++++++-
 lib/ip_frag/ip_frag_internal.c |  4 ++--
 4 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
index e9e26f9abd..a3e85635eb 100644
--- a/lib/hash/rte_cmp_arm64.h
+++ b/lib/hash/rte_cmp_arm64.h
@@ -3,7 +3,7 @@
  */

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 		    size_t key_len __rte_unused)
 {
@@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 	return !(x0 == 0 && x1 == 0);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
index 13a5836351..ddfbef462f 100644
--- a/lib/hash/rte_cmp_x86.h
+++ b/lib/hash/rte_cmp_x86.h
@@ -5,7 +5,7 @@
 #include <rte_vect.h>

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
 {
 	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
@@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
 	return !_mm_test_all_zeros(x, x);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
index 0d8ce6a1e1..5cdd98c8fe 100644
--- a/lib/ip_frag/ip_frag_common.h
+++ b/lib/ip_frag/ip_frag_common.h
@@ -5,7 +5,13 @@
 #ifndef _IP_FRAG_COMMON_H_
 #define _IP_FRAG_COMMON_H_

-#include <sys/queue.h>
+#include <rte_common.h>
+
+#if defined(RTE_ARCH_ARM64)
+#include <rte_cmp_arm64.h>
+#elif defined(RTE_ARCH_X86)
+#include <rte_cmp_x86.h>
+#endif

 #include "rte_ip_frag.h"
 #include "ip_reassembly.h"
@@ -75,12 +81,18 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
 static inline uint64_t
 ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
 {
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
+	return (k1->id_key_len != k2->id_key_len) ||
+	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
+					     rte_hash_k32_cmp_eq(k1, k2, 32));
+#else
 	uint32_t i;
 	uint64_t val;
 	val = k1->id_key_len ^ k2->id_key_len;
 	for (i = 0; i < k1->key_len; i++)
 		val |= k1->src_dst[i] ^ k2->src_dst[i];
 	return val;
+#endif
 }

 /*
diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index b436a4c931..7cbef647df 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *)&key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(key->id, v);
@@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *) &key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(p[2], v);
--
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3 2/2] test: add reassembly perf test
  2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
@ 2023-05-29 14:55     ` pbhagavatula
  2023-05-30 10:51       ` [EXT] " Amit Prakash Shukla
  2023-05-30  3:09     ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation Stephen Hemminger
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 28+ messages in thread
From: pbhagavatula @ 2023-05-29 14:55 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add reassembly perf autotest for both ipv4 and ipv6 reassembly.
Each test is performed with a variable number of fragments per flow,
with either ordered or unordered fragments, and with interleaved flows.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 app/test/meson.build            |    2 +
 app/test/test_reassembly_perf.c | 1001 +++++++++++++++++++++++++++++++
 2 files changed, 1003 insertions(+)
 create mode 100644 app/test/test_reassembly_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index d96ae7a961..70f320f388 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -108,6 +108,7 @@ test_sources = files(
         'test_rawdev.c',
         'test_rcu_qsbr.c',
         'test_rcu_qsbr_perf.c',
+        'test_reassembly_perf.c',
         'test_reciprocal_division.c',
         'test_reciprocal_division_perf.c',
         'test_red.c',
@@ -297,6 +298,7 @@ perf_test_names = [
         'trace_perf_autotest',
         'ipsec_perf_autotest',
         'thash_perf_autotest',
+        'reassembly_perf_autotest',
 ]

 driver_test_names = [
diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
new file mode 100644
index 0000000000..850485a9c5
--- /dev/null
+++ b/app/test/test_reassembly_perf.c
@@ -0,0 +1,1001 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell.
+ */
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_hexdump.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_random.h>
+#include <rte_udp.h>
+
+#include "test.h"
+
+#define MAX_FLOWS	    (1024 * 32)
+#define MAX_BKTS	    MAX_FLOWS
+#define MAX_ENTRIES_PER_BKT 16
+#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define MIN_FRAGMENTS	    2
+#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
+
+#define MAX_PKT_LEN 2048
+#define MAX_TTL_MS  (5 * MS_PER_S)
+
+/* use RFC863 Discard Protocol */
+#define UDP_SRC_PORT 9
+#define UDP_DST_PORT 9
+
+/* use RFC5735 / RFC2544 reserved network test addresses */
+#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
+#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 8) | (x))
+
+/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
+static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#define IP6_VERSION 6
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+
+static struct rte_ip_frag_tbl *frag_tbl;
+static struct rte_mempool *pkt_pool;
+static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
+static uint8_t frag_per_flow[MAX_FLOWS];
+static uint32_t flow_cnt;
+
+#define FILL_MODE_LINEAR      0
+#define FILL_MODE_RANDOM      1
+#define FILL_MODE_INTERLEAVED 2
+
+static int
+reassembly_test_setup(void)
+{
+	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
+
+	frag_tbl = rte_ip_frag_table_create(MAX_FLOWS, MAX_ENTRIES_PER_BKT,
+					    MAX_FLOWS * MAX_ENTRIES_PER_BKT,
+					    max_ttl_cyc, rte_socket_id());
+	if (frag_tbl == NULL)
+		return TEST_FAILED;
+
+	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
+	pkt_pool = rte_pktmbuf_pool_create(
+		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS, 0, 0,
+		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+	if (pkt_pool == NULL) {
+		printf("[%s] Failed to create pkt pool\n", __func__);
+		rte_ip_frag_table_destroy(frag_tbl);
+		return TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static void
+reassembly_test_teardown(void)
+{
+	if (frag_tbl != NULL)
+		rte_ip_frag_table_destroy(frag_tbl);
+
+	if (pkt_pool != NULL)
+		rte_mempool_free(pkt_pool);
+}
+
+static void
+randomize_array_positions(void **array, uint8_t sz)
+{
+	void *tmp;
+	int i, j;
+
+	if (sz == 2) {
+		tmp = array[0];
+		array[0] = array[1];
+		array[1] = tmp;
+	} else {
+		for (i = sz - 1; i > 0; i--) {
+			j = rte_rand_max(i + 1);
+			tmp = array[i];
+			array[i] = array[j];
+			array[j] = tmp;
+		}
+	}
+}
+
+static void
+reassembly_print_banner(const char *proto_str)
+{
+	printf("+=============================================================="
+	       "============================================+\n");
+	printf("| %-32s| %-3s : %-58d|\n", proto_str, "Flow Count", MAX_FLOWS);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+	printf("%-17s%-17s%-14s%-14s%-25s%-20s\n", "| Fragment Order",
+	       "| Fragments/Flow", "| Outstanding", "| Cycles/Flow",
+	       "| Cycles/Fragment insert", "| Cycles/Reassembly |");
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint32_t ip_cksum;
+		uint16_t pkt_len;
+		uint16_t *ptr16;
+
+		frag_offset = i * (frag_len / 8);
+
+		if (i == nb_frags - 1)
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+		else
+			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv4_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
+		ip_hdr->time_to_live = IP_DEFTTL;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id =
+			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
+		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
+		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
+		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));
+
+		/*
+		 * Compute IP header checksum.
+		 */
+		ptr16 = (unaligned_uint16_t *)ip_hdr;
+		ip_cksum = 0;
+		ip_cksum += ptr16[0];
+		ip_cksum += ptr16[1];
+		ip_cksum += ptr16[2];
+		ip_cksum += ptr16[3];
+		ip_cksum += ptr16[4];
+		ip_cksum += ptr16[6];
+		ip_cksum += ptr16[7];
+		ip_cksum += ptr16[8];
+		ip_cksum += ptr16[9];
+
+		/*
+		 * Reduce 32 bit checksum to 16 bits and complement it.
+		 */
+		ip_cksum = ((ip_cksum & 0xFFFF0000) >> 16) +
+			   (ip_cksum & 0x0000FFFF);
+		if (ip_cksum > 65535)
+			ip_cksum -= 65535;
+		ip_cksum = (~ip_cksum) & 0x0000FFFF;
+		if (ip_cksum == 0)
+			ip_cksum = 0xFFFF;
+		ip_hdr->hdr_checksum = (uint16_t)ip_cksum;
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len = sizeof(struct rte_ipv4_hdr);
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static uint8_t
+get_rand_frags(uint8_t max_frag)
+{
+	uint8_t frags = rte_rand_max(max_frag + 1);
+
+	return frags <= 1 ? MIN_FRAGMENTS : frags;
+}
+
+static int
+ipv4_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+ipv6_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct ipv6_extension_fragment *frag_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint16_t pkt_len;
+
+		frag_offset = i * (frag_len / 8);
+		frag_offset <<= 3;
+		if (i == nb_frags - 1) {
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 0);
+		} else {
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 1);
+		}
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv6_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr) +
+				RTE_IPV6_FRAG_HDR_SIZE);
+		frag_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct ipv6_extension_fragment *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv6_hdr) +
+				     RTE_IPV6_FRAG_HDR_SIZE);
+		ip_hdr->vtc_flow = rte_cpu_to_be_32(IP6_VERSION << 28);
+		ip_hdr->payload_len =
+			rte_cpu_to_be_16(pkt_len - sizeof(struct rte_ipv6_hdr));
+		ip_hdr->proto = IPPROTO_FRAGMENT;
+		ip_hdr->hop_limits = IP_DEFTTL;
+		memcpy(ip_hdr->src_addr, ip6_addr, sizeof(ip_hdr->src_addr));
+		memcpy(ip_hdr->dst_addr, ip6_addr, sizeof(ip_hdr->dst_addr));
+		ip_hdr->src_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->src_addr[7] |= 0x10;
+		ip_hdr->src_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->src_addr[9] = flow_id & 0xff;
+
+		ip_hdr->dst_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->dst_addr[7] |= 0x20;
+		ip_hdr->dst_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->dst_addr[9] = flow_id & 0xff;
+
+		frag_hdr->next_header = IPPROTO_UDP;
+		frag_hdr->reserved = 0;
+		frag_hdr->frag_data = rte_cpu_to_be_16(frag_offset);
+		frag_hdr->id = rte_cpu_to_be_32(flow_id + 1);
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len =
+			sizeof(struct rte_ipv6_hdr) + RTE_IPV6_FRAG_HDR_SIZE;
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static int
+ipv6_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+frag_pkt_teardown(void)
+{
+	uint32_t i;
+
+	for (i = 0; i < flow_cnt; i++)
+		rte_pktmbuf_free(mbufs[i][0]);
+}
+
+static void
+reassembly_print_stats(int8_t nb_frags, uint8_t fill_order,
+		       uint32_t outstanding, uint64_t cyc_per_flow,
+		       uint64_t cyc_per_frag_insert,
+		       uint64_t cyc_per_reassembly)
+{
+	char frag_str[8], order_str[12];
+
+	if (nb_frags > 0)
+		snprintf(frag_str, sizeof(frag_str), "%d", nb_frags);
+	else
+		snprintf(frag_str, sizeof(frag_str), "RANDOM");
+
+	switch (fill_order) {
+	case FILL_MODE_LINEAR:
+		snprintf(order_str, sizeof(order_str), "LINEAR");
+		break;
+	case FILL_MODE_RANDOM:
+		snprintf(order_str, sizeof(order_str), "RANDOM");
+		break;
+	case FILL_MODE_INTERLEAVED:
+		snprintf(order_str, sizeof(order_str), "INTERLEAVED");
+		break;
+	default:
+		break;
+	}
+
+	printf("| %-14s | %-14s | %-11d | %-11" PRIu64 " | %-22" PRIu64
+	       " | %-17" PRIu64 " |\n",
+	       order_str, frag_str, outstanding, cyc_per_flow,
+	       cyc_per_frag_insert, cyc_per_reassembly);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+join_array(struct rte_mbuf **dest_arr, struct rte_mbuf **src_arr,
+	   uint8_t offset, uint8_t sz)
+{
+	int i, j;
+
+	for (i = offset, j = 0; j < sz; i++, j++)
+		dest_arr[i] = src_arr[j];
+}
+
+static int
+ipv4_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_interleaved_flows_perf(uint8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv4_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv4_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv4_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv4_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv4_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+ipv6_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv6_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv6_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv6_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv6_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv6_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+test_reassembly_perf(void)
+{
+	int8_t nb_fragments[] = {2, 3, MAX_FRAGMENTS, -1 /* Random */};
+	uint8_t order_type[] = {FILL_MODE_LINEAR, FILL_MODE_RANDOM};
+	uint32_t outstanding[] = {100, 500, 1000, 2000, 3000};
+	uint32_t i, j;
+	int rc;
+
+	rc = reassembly_test_setup();
+	if (rc)
+		return rc;
+
+	reassembly_print_banner("IPV4");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv4_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv4_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	printf("\n");
+	reassembly_print_banner("IPV6");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv6_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv6_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	reassembly_test_teardown();
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(reassembly_perf_autotest, test_reassembly_perf);
--
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/2] ip_frag: optimize key compare and hash generation
  2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-05-29 14:55     ` [PATCH v3 2/2] test: add reassembly perf test pbhagavatula
@ 2023-05-30  3:09     ` Stephen Hemminger
  2023-05-30 17:50       ` [EXT] " Pavan Nikhilesh Bhagavatula
  2023-05-30  7:44     ` Ruifeng Wang
  2023-05-31  4:26     ` [PATCH v4 " pbhagavatula
  3 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2023-05-30  3:09 UTC (permalink / raw)
  To: pbhagavatula
  Cc: jerinj, Ruifeng Wang, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin, Konstantin Ananyev, dev

On Mon, 29 May 2023 20:25:01 +0530
<pbhagavatula@marvell.com> wrote:

> +	return (k1->id_key_len != k2->id_key_len) ||
> +	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
> +					     rte_hash_k32_cmp_eq(k1, k2, 32));

If you make another version, one small comment.
Breaking this into a couple of if statements would make it easier for
human readers to follow. The compiler doesn't care.
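
Something like this, presumably (untested sketch of the suggestion, with the
same semantics as the ternary expression above):

	static inline uint64_t
	ip_frag_key_cmp(const struct ip_frag_key *k1, const struct ip_frag_key *k2)
	{
		/* id and key_len are packed into a single 64-bit word. */
		if (k1->id_key_len != k2->id_key_len)
			return 1;
		/* IPv4 keys only need the first src_dst word. */
		if (k1->key_len == IPV4_KEYLEN)
			return k1->src_dst[0] != k2->src_dst[0];
		/* IPv6: compare the full 32-byte src/dst pair. */
		return rte_hash_k32_cmp_eq(k1, k2, 32);
	}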

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH v3 1/2] ip_frag: optimize key compare and hash generation
  2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-05-29 14:55     ` [PATCH v3 2/2] test: add reassembly perf test pbhagavatula
  2023-05-30  3:09     ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation Stephen Hemminger
@ 2023-05-30  7:44     ` Ruifeng Wang
  2023-05-31  4:26     ` [PATCH v4 " pbhagavatula
  3 siblings, 0 replies; 28+ messages in thread
From: Ruifeng Wang @ 2023-05-30  7:44 UTC (permalink / raw)
  To: pbhagavatula, jerinj, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin, Konstantin Ananyev
  Cc: dev, nd

> -----Original Message-----
> From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> Sent: Monday, May 29, 2023 10:55 PM
> To: jerinj@marvell.com; Ruifeng Wang <Ruifeng.Wang@arm.com>; Yipeng Wang
> <yipeng1.wang@intel.com>; Sameh Gobriel <sameh.gobriel@intel.com>; Bruce Richardson
> <bruce.richardson@intel.com>; Vladimir Medvedkin <vladimir.medvedkin@intel.com>;
> Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> Subject: [PATCH v3 1/2] ip_frag: optimize key compare and hash generation
> 
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Use optimized rte_hash_k32_cmp_eq routine for key comparison for
> x86 and ARM64.
> Use CRC instructions for hash generation on ARM64.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
> On Neoverse-N2, performance improved by 10% when measured with examples/ip_reassembly.
> 
>  v3 Changes:
>  - Drop NEON patch.
>  v2 Changes:
>  - Fix compilation failure with non ARM64/x86 targets
> 
>  lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
>  lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
>  lib/ip_frag/ip_frag_common.h   | 14 +++++++++++++-
>  lib/ip_frag/ip_frag_internal.c |  4 ++--
>  4 files changed, 31 insertions(+), 19 deletions(-)
> 
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXT] [PATCH v3 2/2] test: add reassembly perf test
  2023-05-29 14:55     ` [PATCH v3 2/2] test: add reassembly perf test pbhagavatula
@ 2023-05-30 10:51       ` Amit Prakash Shukla
  0 siblings, 0 replies; 28+ messages in thread
From: Amit Prakash Shukla @ 2023-05-30 10:51 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
  Cc: dev, Pavan Nikhilesh Bhagavatula



> -----Original Message-----
> From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> Sent: Monday, May 29, 2023 8:25 PM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Cc: dev@dpdk.org; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>
> Subject: [EXT] [PATCH v3 2/2] test: add reassembly perf test
> 
> External Email
> 
> ----------------------------------------------------------------------
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Add reassembly perf autotest for both ipv4 and ipv6 reassembly.
> Each test is performed with a variable number of fragments per flow, with
> either ordered or unordered fragments, and with interleaved flows.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  app/test/meson.build            |    2 +
>  app/test/test_reassembly_perf.c | 1001 +++++++++++++++++++++++++++++++
>  2 files changed, 1003 insertions(+)
>  create mode 100644 app/test/test_reassembly_perf.c
> 
> diff --git a/app/test/meson.build b/app/test/meson.build
> index d96ae7a961..70f320f388 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -108,6 +108,7 @@ test_sources = files(
>          'test_rawdev.c',
>          'test_rcu_qsbr.c',
>          'test_rcu_qsbr_perf.c',
> +        'test_reassembly_perf.c',
>          'test_reciprocal_division.c',
>          'test_reciprocal_division_perf.c',
>          'test_red.c',
> @@ -297,6 +298,7 @@ perf_test_names = [
>          'trace_perf_autotest',
>          'ipsec_perf_autotest',
>          'thash_perf_autotest',
> +        'reassembly_perf_autotest',
>  ]
> 
>  driver_test_names = [
> diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
> new file mode 100644
> index 0000000000..850485a9c5
> --- /dev/null
> +++ b/app/test/test_reassembly_perf.c
> @@ -0,0 +1,1001 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Marvell.
> + */
> +
> +#include <rte_byteorder.h>
> +#include <rte_common.h>
> +#include <rte_cycles.h>
> +#include <rte_ether.h>
> +#include <rte_hexdump.h>
> +#include <rte_ip.h>
> +#include <rte_ip_frag.h>
> +#include <rte_mbuf.h>
> +#include <rte_mbuf_pool_ops.h>
> +#include <rte_random.h>
> +#include <rte_udp.h>
> +
> +#include "test.h"
> +
> +#define MAX_FLOWS	    (1024 * 32)
> +#define MAX_BKTS	    MAX_FLOWS
> +#define MAX_ENTRIES_PER_BKT 16
> +#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
> +#define MIN_FRAGMENTS	    2
> +#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
> +
> +#define MAX_PKT_LEN 2048
> +#define MAX_TTL_MS  (5 * MS_PER_S)
> +
> +/* use RFC863 Discard Protocol */
> +#define UDP_SRC_PORT 9
> +#define UDP_DST_PORT 9
> +
> +/* use RFC5735 / RFC2544 reserved network test addresses */
> +#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
> +#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 8) | (x))
> +
> +/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
> +static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> +#define IP6_VERSION 6
> +
> +#define IP_DEFTTL 64 /* from RFC 1340. */
> +
> +static struct rte_ip_frag_tbl *frag_tbl;
> +static struct rte_mempool *pkt_pool;
> +static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
> +static uint8_t frag_per_flow[MAX_FLOWS];
> +static uint32_t flow_cnt;
> +
> +#define FILL_MODE_LINEAR      0
> +#define FILL_MODE_RANDOM      1
> +#define FILL_MODE_INTERLEAVED 2
> +
> +static int
> +reassembly_test_setup(void)
> +{
> +	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
> +
> +	frag_tbl = rte_ip_frag_table_create(MAX_FLOWS, MAX_ENTRIES_PER_BKT,

I see MAX_BKTS and MAX_FLOWS are the same in this application. Just for code readability, please use MAX_BKTS here.

> +					    MAX_FLOWS * MAX_ENTRIES_PER_BKT,
> +					    max_ttl_cyc, rte_socket_id());
> +	if (frag_tbl == NULL)
> +		return TEST_FAILED;
> +
> +	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
> +	pkt_pool = rte_pktmbuf_pool_create(
> +		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS,
> 0, 0,
> +		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
> +	if (pkt_pool == NULL) {
> +		printf("[%s] Failed to create pkt pool\n", __func__);
> +		rte_ip_frag_table_destroy(frag_tbl);
> +		return TEST_FAILED;
> +	}
> +
> +	return TEST_SUCCESS;
> +}
> +
> +static void
> +reassembly_test_teardown(void)
> +{
> +	if (frag_tbl != NULL)
> +		rte_ip_frag_table_destroy(frag_tbl);
> +
> +	if (pkt_pool != NULL)
> +		rte_mempool_free(pkt_pool);
> +}
> +

<snip>

> +static void
> +ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
> +		    uint8_t fill_mode)
> +{
> +	struct rte_ether_hdr *eth_hdr;
> +	struct rte_ipv4_hdr *ip_hdr;
> +	struct rte_udp_hdr *udp_hdr;
> +	uint16_t frag_len;
> +	uint8_t i;
> +
> +	frag_len = MAX_PKT_LEN / nb_frags;
> +	if (frag_len % 8)
> +		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
> +
> +	for (i = 0; i < nb_frags; i++) {
> +		struct rte_mbuf *frag = mbuf[i];
> +		uint16_t frag_offset = 0;
> +		uint32_t ip_cksum;
> +		uint16_t pkt_len;
> +		uint16_t *ptr16;
> +
> +		frag_offset = i * (frag_len / 8);
> +
> +		if (i == nb_frags - 1)
> +			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
> +		else
> +			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
> +
> +		rte_pktmbuf_reset_headroom(frag);
> +		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
> +		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
> +						 sizeof(struct rte_ether_hdr));
> +		udp_hdr = rte_pktmbuf_mtod_offset(
> +			frag, struct rte_udp_hdr *,
> +			sizeof(struct rte_ether_hdr) +
> +				sizeof(struct rte_ipv4_hdr));
> +
> +		rte_ether_unformat_addr("02:00:00:00:00:01",
> +					&eth_hdr->dst_addr);
> +		rte_ether_unformat_addr("02:00:00:00:00:00",
> +					&eth_hdr->src_addr);
> +		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
> +
> +		pkt_len = frag_len;
> +		/*
> +		 * Initialize UDP header.
> +		 */
> +		if (i == 0) {
> +			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
> +			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
> +			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
> +			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
> +		}
> +
> +		/*
> +		 * Initialize IP header.
> +		 */
> +		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
> +		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
> +		ip_hdr->type_of_service = 0;
> +		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
> +		ip_hdr->time_to_live = IP_DEFTTL;
> +		ip_hdr->next_proto_id = IPPROTO_UDP;
> +		ip_hdr->packet_id =
> +			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
> +		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
> +		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
> +		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));

flow_id is 32 bit and the max number of flows for this application is 32768. Using the flow_id directly for
the first octet will overwrite even the subsequent octet. It is fine for this test, as the benchmark testing
subnet is 198.18.0.0/15 and with 32k flows it does not breach the network part of the IP address, but a
comment will help if anyone tries to increase the number of flows in the future.
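
To make the point concrete, here is a minimal standalone sketch (illustration
only, not part of the patch; the macro is copied from the test). Since
IP_SRC_ADDR() ORs the flow id into the low bits, any flow_id above 255 spills
into the third octet:

    #include <stdint.h>
    #include <stdio.h>

    #define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))

    int main(void)
    {
            uint32_t ip = IP_SRC_ADDR(300); /* flow_id > 255 */

            /* Prints 198.18.1.44: the flow id leaked into the 3rd octet. */
            printf("%u.%u.%u.%u\n", ip >> 24, (ip >> 16) & 0xff,
                   (ip >> 8) & 0xff, ip & 0xff);
            return 0;
    }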

> +
> +		/*
> +		 * Compute IP header checksum.
> +		 */
> +		ptr16 = (unaligned_uint16_t *)ip_hdr;
> +		ip_cksum = 0;
> +		ip_cksum += ptr16[0];
> +		ip_cksum += ptr16[1];
> +		ip_cksum += ptr16[2];
> +		ip_cksum += ptr16[3];
> +		ip_cksum += ptr16[4];
> +		ip_cksum += ptr16[6];
> +		ip_cksum += ptr16[7];
> +		ip_cksum += ptr16[8];
> +		ip_cksum += ptr16[9];

Reviewed-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Tested-by: Amit Prakash Shukla <amitprakashs@marvell.com>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [EXT] Re: [PATCH v3 1/2] ip_frag: optimize key compare and hash generation
  2023-05-30  3:09     ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation Stephen Hemminger
@ 2023-05-30 17:50       ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 28+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2023-05-30 17:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jerin Jacob Kollanukkaran, Ruifeng Wang, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin,
	Konstantin Ananyev, dev

> On Mon, 29 May 2023 20:25:01 +0530
> <pbhagavatula@marvell.com> wrote:
> 
> > +	return (k1->id_key_len != k2->id_key_len) ||
> > +	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
> > +					     rte_hash_k32_cmp_eq(k1, k2,
> 32));
> 
> If you make another version, one small comment.
> Breaking this into a couple of if statements would make reading easier
> for human readers. Compiler doesn't care.

I have modified the above code to:

       if (k1->id_key_len != k2->id_key_len)
               return 1;
       if (k1->key_len == IPV4_KEYLEN)
               return k1->src_dst[0] != k2->src_dst[0];
       else
               return rte_hash_k32_cmp_eq(k1, k2, 32);

But upon remeasuring performance, I see a performance loss of 1.2%;
the compiler (GCC 10) generates additional branches with the above code.

I have also profiled the ip_reassembly application with and without the changes and see a lot of
additional branch misses.


Current implementation:

==============
Branch Metrics
==============
Branch MPKI                                                  : 0.159             
Branch PKI                                                   : 156.566           
Branch Mis-prediction Rate                                   : 0.101             

INST_RETIRED       : ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 9.493B
BR_RETIRED         : ▇▇▇▇▇▇▇ 1.486B
BR_MIS_PRED_RETIRED: ▏ 1.508M
BR_IMMED_SPEC      : ▇▇▇▇▇▇▇ 1.395B
BR_RETURN_SPEC     : ▏ 105.203M
BR_INDIRECT_SPEC   : ▏ 106.044M

Modified implementation:

==============
Branch Metrics
==============
Branch MPKI                                                  : 0.282             
Branch PKI                                                   : 156.566           
Branch Mis-prediction Rate                                   : 0.180             

INST_RETIRED       : ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 9.444B
BR_RETIRED         : ▇▇▇▇▇▇▇ 1.479B
BR_MIS_PRED_RETIRED: ▏ 2.662M
BR_IMMED_SPEC      : ▇▇▇▇▇▇▇ 1.388B
BR_RETURN_SPEC     : ▏ 104.518M
BR_INDIRECT_SPEC   : ▏ 105.354M
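
(For reference, Branch MPKI is branch mispredicts per kilo-instruction,
i.e. BR_MIS_PRED_RETIRED / INST_RETIRED * 1000: 1.508M / 9.493B * 1000
~= 0.159 for the current code vs 2.662M / 9.444B * 1000 ~= 0.282 for the
modified one, i.e. roughly 1.75x more mispredicts for an almost identical
instruction count.)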


I will retain the current implementation in the next patch.

Thanks,
Pavan.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v4 1/2] ip_frag: optimize key compare and hash generation
  2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
                       ` (2 preceding siblings ...)
  2023-05-30  7:44     ` Ruifeng Wang
@ 2023-05-31  4:26     ` pbhagavatula
  2023-05-31  4:26       ` [PATCH v4 2/2] test: add reassembly perf test pbhagavatula
  2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  3 siblings, 2 replies; 28+ messages in thread
From: pbhagavatula @ 2023-05-31  4:26 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use optimized rte_hash_k32_cmp_eq routine for key comparison for
x86 and ARM64.
Use CRC instructions for hash generation on ARM64.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
On Neoverse-N2, performance improved by 10% when measured with
examples/ip_reassembly.

 v4 Changes:
 - Fix compilation failures (sys/queue)
 - Update test case to use proper macros.
 v3 Changes:
 - Drop NEON patch.
 v2 Changes:
 - Fix compilation failure with non ARM64/x86 targets

 lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
 lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
 lib/ip_frag/ip_frag_common.h   | 14 ++++++++++++++
 lib/ip_frag/ip_frag_internal.c |  4 ++--
 4 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
index e9e26f9abd..a3e85635eb 100644
--- a/lib/hash/rte_cmp_arm64.h
+++ b/lib/hash/rte_cmp_arm64.h
@@ -3,7 +3,7 @@
  */

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 		    size_t key_len __rte_unused)
 {
@@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 	return !(x0 == 0 && x1 == 0);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
index 13a5836351..ddfbef462f 100644
--- a/lib/hash/rte_cmp_x86.h
+++ b/lib/hash/rte_cmp_x86.h
@@ -5,7 +5,7 @@
 #include <rte_vect.h>

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
 {
 	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
@@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
 	return !_mm_test_all_zeros(x, x);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
index 0d8ce6a1e1..7d6c1aa98d 100644
--- a/lib/ip_frag/ip_frag_common.h
+++ b/lib/ip_frag/ip_frag_common.h
@@ -7,6 +7,14 @@

 #include <sys/queue.h>

+#include <rte_common.h>
+
+#if defined(RTE_ARCH_ARM64)
+#include <rte_cmp_arm64.h>
+#elif defined(RTE_ARCH_X86)
+#include <rte_cmp_x86.h>
+#endif
+
 #include "rte_ip_frag.h"
 #include "ip_reassembly.h"

@@ -75,12 +83,18 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
 static inline uint64_t
 ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
 {
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
+	return (k1->id_key_len != k2->id_key_len) ||
+	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
+					     rte_hash_k32_cmp_eq(k1, k2, 32));
+#else
 	uint32_t i;
 	uint64_t val;
 	val = k1->id_key_len ^ k2->id_key_len;
 	for (i = 0; i < k1->key_len; i++)
 		val |= k1->src_dst[i] ^ k2->src_dst[i];
 	return val;
+#endif
 }

 /*
diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index b436a4c931..7cbef647df 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *)&key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(key->id, v);
@@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *) &key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(p[2], v);
--
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v4 2/2] test: add reassembly perf test
  2023-05-31  4:26     ` [PATCH v4 " pbhagavatula
@ 2023-05-31  4:26       ` pbhagavatula
  2023-06-05 11:12         ` Константин Ананьев
  2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  1 sibling, 1 reply; 28+ messages in thread
From: pbhagavatula @ 2023-05-31  4:26 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh, Amit Prakash Shukla

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add reassembly perf autotest for both ipv4 and ipv6 reassembly.
Each test is performed with variable number of fragments per flow,
either ordered or unordered fragments and interleaved flows.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reviewed-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Tested-by: Amit Prakash Shukla <amitprakashs@marvell.com>
---
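Note: once applied, the autotest can be run like any other perf test,
e.g. (assuming the standard app/test binary produced by the build):

  DPDK_TEST=reassembly_perf_autotest ./build/app/test/dpdk-test

or by issuing reassembly_perf_autotest at the RTE>> prompt.
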
 app/test/meson.build            |    2 +
 app/test/test_reassembly_perf.c | 1002 +++++++++++++++++++++++++++++++
 2 files changed, 1004 insertions(+)
 create mode 100644 app/test/test_reassembly_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index d96ae7a961..70f320f388 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -108,6 +108,7 @@ test_sources = files(
         'test_rawdev.c',
         'test_rcu_qsbr.c',
         'test_rcu_qsbr_perf.c',
+        'test_reassembly_perf.c',
         'test_reciprocal_division.c',
         'test_reciprocal_division_perf.c',
         'test_red.c',
@@ -297,6 +298,7 @@ perf_test_names = [
         'trace_perf_autotest',
         'ipsec_perf_autotest',
         'thash_perf_autotest',
+        'reassembly_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
new file mode 100644
index 0000000000..f72b5b576e
--- /dev/null
+++ b/app/test/test_reassembly_perf.c
@@ -0,0 +1,1002 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell.
+ */
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_hexdump.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_random.h>
+#include <rte_udp.h>
+
+#include "test.h"
+
+#define MAX_FLOWS	    (1024 * 32)
+#define MAX_BKTS	    MAX_FLOWS
+#define MAX_ENTRIES_PER_BKT 16
+#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define MIN_FRAGMENTS	    2
+#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
+
+#define MAX_PKT_LEN 2048
+#define MAX_TTL_MS  (5 * MS_PER_S)
+
+/* use RFC863 Discard Protocol */
+#define UDP_SRC_PORT 9
+#define UDP_DST_PORT 9
+
+/* use RFC5735 / RFC2544 reserved network test addresses */
+#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
+#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 15) | (x))
+
+/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
+static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#define IP6_VERSION 6
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+
+static struct rte_ip_frag_tbl *frag_tbl;
+static struct rte_mempool *pkt_pool;
+static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
+static uint8_t frag_per_flow[MAX_FLOWS];
+static uint32_t flow_cnt;
+
+#define FILL_MODE_LINEAR      0
+#define FILL_MODE_RANDOM      1
+#define FILL_MODE_INTERLEAVED 2
+
+static int
+reassembly_test_setup(void)
+{
+	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
+
+	frag_tbl = rte_ip_frag_table_create(MAX_BKTS, MAX_ENTRIES_PER_BKT,
+					    MAX_BKTS * MAX_ENTRIES_PER_BKT, max_ttl_cyc,
+					    rte_socket_id());
+	if (frag_tbl == NULL)
+		return TEST_FAILED;
+
+	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
+	pkt_pool = rte_pktmbuf_pool_create(
+		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS, 0, 0,
+		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+	if (pkt_pool == NULL) {
+		printf("[%s] Failed to create pkt pool\n", __func__);
+		rte_ip_frag_table_destroy(frag_tbl);
+		return TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static void
+reassembly_test_teardown(void)
+{
+	if (frag_tbl != NULL)
+		rte_ip_frag_table_destroy(frag_tbl);
+
+	if (pkt_pool != NULL)
+		rte_mempool_free(pkt_pool);
+}
+
+static void
+randomize_array_positions(void **array, uint8_t sz)
+{
+	void *tmp;
+	int i, j;
+
+	if (sz == 2) {
+		tmp = array[0];
+		array[0] = array[1];
+		array[1] = tmp;
+	} else {
+		for (i = sz - 1; i > 0; i--) {
+			j = rte_rand_max(i + 1);
+			tmp = array[i];
+			array[i] = array[j];
+			array[j] = tmp;
+		}
+	}
+}
+
+static void
+reassembly_print_banner(const char *proto_str)
+{
+	printf("+=============================================================="
+	       "============================================+\n");
+	printf("| %-32s| %-3s : %-58d|\n", proto_str, "Flow Count", MAX_FLOWS);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+	printf("%-17s%-17s%-14s%-14s%-25s%-20s\n", "| Fragment Order",
+	       "| Fragments/Flow", "| Outstanding", "| Cycles/Flow",
+	       "| Cycles/Fragment insert", "| Cycles/Reassembly |");
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint32_t ip_cksum;
+		uint16_t pkt_len;
+		uint16_t *ptr16;
+
+		frag_offset = i * (frag_len / 8);
+
+		if (i == nb_frags - 1)
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+		else
+			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv4_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
+		ip_hdr->time_to_live = IP_DEFTTL;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id =
+			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
+		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
+		/* Using more than 32K flows will modify the 2nd octet of the IP. */
+		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
+		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));
+
+		/*
+		 * Compute IP header checksum.
+		 */
+		ptr16 = (unaligned_uint16_t *)ip_hdr;
+		ip_cksum = 0;
+		ip_cksum += ptr16[0];
+		ip_cksum += ptr16[1];
+		ip_cksum += ptr16[2];
+		ip_cksum += ptr16[3];
+		ip_cksum += ptr16[4];
+		ip_cksum += ptr16[6];
+		ip_cksum += ptr16[7];
+		ip_cksum += ptr16[8];
+		ip_cksum += ptr16[9];
+
+		/*
+		 * Reduce 32 bit checksum to 16 bits and complement it.
+		 */
+		ip_cksum = ((ip_cksum & 0xFFFF0000) >> 16) +
+			   (ip_cksum & 0x0000FFFF);
+		if (ip_cksum > 65535)
+			ip_cksum -= 65535;
+		ip_cksum = (~ip_cksum) & 0x0000FFFF;
+		if (ip_cksum == 0)
+			ip_cksum = 0xFFFF;
+		ip_hdr->hdr_checksum = (uint16_t)ip_cksum;
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len = sizeof(struct rte_ipv4_hdr);
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static uint8_t
+get_rand_frags(uint8_t max_frag)
+{
+	uint8_t frags = rte_rand_max(max_frag + 1);
+
+	return frags <= 1 ? MIN_FRAGMENTS : frags;
+}
+
+static int
+ipv4_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+ipv6_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct ipv6_extension_fragment *frag_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint16_t pkt_len;
+
+		frag_offset = i * (frag_len / 8);
+		frag_offset <<= 3;
+		if (i == nb_frags - 1) {
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 0);
+		} else {
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 1);
+		}
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv6_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr) +
+				RTE_IPV6_FRAG_HDR_SIZE);
+		frag_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct ipv6_extension_fragment *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv6_hdr) +
+				     RTE_IPV6_FRAG_HDR_SIZE);
+		ip_hdr->vtc_flow = rte_cpu_to_be_32(IP6_VERSION << 28);
+		ip_hdr->payload_len =
+			rte_cpu_to_be_16(pkt_len - sizeof(struct rte_ipv6_hdr));
+		ip_hdr->proto = IPPROTO_FRAGMENT;
+		ip_hdr->hop_limits = IP_DEFTTL;
+		memcpy(ip_hdr->src_addr, ip6_addr, sizeof(ip_hdr->src_addr));
+		memcpy(ip_hdr->dst_addr, ip6_addr, sizeof(ip_hdr->dst_addr));
+		ip_hdr->src_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->src_addr[7] |= 0x10;
+		ip_hdr->src_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->src_addr[9] = flow_id & 0xff;
+
+		ip_hdr->dst_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->dst_addr[7] |= 0x20;
+		ip_hdr->dst_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->dst_addr[9] = flow_id & 0xff;
+
+		frag_hdr->next_header = IPPROTO_UDP;
+		frag_hdr->reserved = 0;
+		frag_hdr->frag_data = rte_cpu_to_be_16(frag_offset);
+		frag_hdr->id = rte_cpu_to_be_32(flow_id + 1);
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len =
+			sizeof(struct rte_ipv6_hdr) + RTE_IPV6_FRAG_HDR_SIZE;
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static int
+ipv6_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+frag_pkt_teardown(void)
+{
+	uint32_t i;
+
+	for (i = 0; i < flow_cnt; i++)
+		rte_pktmbuf_free(mbufs[i][0]);
+}
+
+static void
+reassembly_print_stats(int8_t nb_frags, uint8_t fill_order,
+		       uint32_t outstanding, uint64_t cyc_per_flow,
+		       uint64_t cyc_per_frag_insert,
+		       uint64_t cyc_per_reassembly)
+{
+	char frag_str[8], order_str[12];
+
+	if (nb_frags > 0)
+		snprintf(frag_str, sizeof(frag_str), "%d", nb_frags);
+	else
+		snprintf(frag_str, sizeof(frag_str), "RANDOM");
+
+	switch (fill_order) {
+	case FILL_MODE_LINEAR:
+		snprintf(order_str, sizeof(order_str), "LINEAR");
+		break;
+	case FILL_MODE_RANDOM:
+		snprintf(order_str, sizeof(order_str), "RANDOM");
+		break;
+	case FILL_MODE_INTERLEAVED:
+		snprintf(order_str, sizeof(order_str), "INTERLEAVED");
+		break;
+	default:
+		break;
+	}
+
+	printf("| %-14s | %-14s | %-11d | %-11" PRIu64 " | %-22" PRIu64
+	       " | %-17" PRIu64 " |\n",
+	       order_str, frag_str, outstanding, cyc_per_flow,
+	       cyc_per_frag_insert, cyc_per_reassembly);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+join_array(struct rte_mbuf **dest_arr, struct rte_mbuf **src_arr,
+	   uint8_t offset, uint8_t sz)
+{
+	int i, j;
+
+	for (i = offset, j = 0; j < sz; i++, j++)
+		dest_arr[i] = src_arr[j];
+}
+
+static int
+ipv4_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_interleaved_flows_perf(uint8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv4_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv4_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv4_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv4_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv4_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+ipv6_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv6_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv6_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv6_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv6_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv6_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+test_reassembly_perf(void)
+{
+	int8_t nb_fragments[] = {2, 3, MAX_FRAGMENTS, -1 /* Random */};
+	uint8_t order_type[] = {FILL_MODE_LINEAR, FILL_MODE_RANDOM};
+	uint32_t outstanding[] = {100, 500, 1000, 2000, 3000};
+	uint32_t i, j;
+	int rc;
+
+	rc = reassembly_test_setup();
+	if (rc)
+		return rc;
+
+	reassembly_print_banner("IPV4");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv4_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv4_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	printf("\n");
+	reassembly_print_banner("IPV6");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv6_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv6_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	reassembly_test_teardown();
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(reassembly_perf_autotest, test_reassembly_perf);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v5 1/2] ip_frag: optimize key compare and hash generation
  2023-05-31  4:26     ` [PATCH v4 " pbhagavatula
  2023-05-31  4:26       ` [PATCH v4 2/2] test: add reassembly perf test pbhagavatula
@ 2023-06-02 17:01       ` pbhagavatula
  2023-06-02 17:01         ` [PATCH v5 2/2] test: add reassembly perf test pbhagavatula
                           ` (3 more replies)
  1 sibling, 4 replies; 28+ messages in thread
From: pbhagavatula @ 2023-06-02 17:01 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd, Ruifeng Wang, Yipeng Wang,
	Sameh Gobriel, Bruce Richardson, Vladimir Medvedkin,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use optimized rte_hash_k32_cmp_eq routine for key comparison for
x86 and ARM64.
Use CRC instructions for hash generation on ARM64.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
On Neoverse-N2, performance improved by 10% when measured with
examples/ip_reassembly.

 v5 Changes:
 - Fix spellcheck.
 v4 Changes:
 - Fix compilation failures (sys/queue)
 - Update test case to use proper macros.
 v3 Changes:
 - Drop NEON patch.
 v2 Changes:
 - Fix compilation failure with non ARM64/x86 targets

 lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
 lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
 lib/ip_frag/ip_frag_common.h   | 14 ++++++++++++++
 lib/ip_frag/ip_frag_internal.c |  4 ++--
 4 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
index e9e26f9abd..a3e85635eb 100644
--- a/lib/hash/rte_cmp_arm64.h
+++ b/lib/hash/rte_cmp_arm64.h
@@ -3,7 +3,7 @@
  */

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 		    size_t key_len __rte_unused)
 {
@@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 	return !(x0 == 0 && x1 == 0);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
index 13a5836351..ddfbef462f 100644
--- a/lib/hash/rte_cmp_x86.h
+++ b/lib/hash/rte_cmp_x86.h
@@ -5,7 +5,7 @@
 #include <rte_vect.h>

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
 {
 	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
@@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
 	return !_mm_test_all_zeros(x, x);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
index 0d8ce6a1e1..7d6c1aa98d 100644
--- a/lib/ip_frag/ip_frag_common.h
+++ b/lib/ip_frag/ip_frag_common.h
@@ -7,6 +7,14 @@

 #include <sys/queue.h>

+#include <rte_common.h>
+
+#if defined(RTE_ARCH_ARM64)
+#include <rte_cmp_arm64.h>
+#elif defined(RTE_ARCH_X86)
+#include <rte_cmp_x86.h>
+#endif
+
 #include "rte_ip_frag.h"
 #include "ip_reassembly.h"

@@ -75,12 +83,18 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
 static inline uint64_t
 ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
 {
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
+	return (k1->id_key_len != k2->id_key_len) ||
+	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
+					     rte_hash_k32_cmp_eq(k1, k2, 32));
+#else
 	uint32_t i;
 	uint64_t val;
 	val = k1->id_key_len ^ k2->id_key_len;
 	for (i = 0; i < k1->key_len; i++)
 		val |= k1->src_dst[i] ^ k2->src_dst[i];
 	return val;
+#endif
 }

 /*
diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index b436a4c931..7cbef647df 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *)&key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(key->id, v);
@@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *) &key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(p[2], v);
--
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v5 2/2] test: add reassembly perf test
  2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
@ 2023-06-02 17:01         ` pbhagavatula
  2023-06-27  9:36           ` Konstantin Ananyev
  2023-06-05 11:09         ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation Константин Ананьев
                           ` (2 subsequent siblings)
  3 siblings, 1 reply; 28+ messages in thread
From: pbhagavatula @ 2023-06-02 17:01 UTC (permalink / raw)
  To: jerinj, Honnappa.Nagarahalli, nd
  Cc: dev, Pavan Nikhilesh, Amit Prakash Shukla

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add reassembly perf autotest for both ipv4 and ipv6 reassembly.
Each test is performed with variable number of fragments per flow,
either ordered or unordered fragments and interleaved flows.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reviewed-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Tested-by: Amit Prakash Shukla <amitprakashs@marvell.com>
---
 app/test/meson.build            |    2 +
 app/test/test_reassembly_perf.c | 1002 +++++++++++++++++++++++++++++++
 2 files changed, 1004 insertions(+)
 create mode 100644 app/test/test_reassembly_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index d96ae7a961..70f320f388 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -108,6 +108,7 @@ test_sources = files(
         'test_rawdev.c',
         'test_rcu_qsbr.c',
         'test_rcu_qsbr_perf.c',
+        'test_reassembly_perf.c',
         'test_reciprocal_division.c',
         'test_reciprocal_division_perf.c',
         'test_red.c',
@@ -297,6 +298,7 @@ perf_test_names = [
         'trace_perf_autotest',
         'ipsec_perf_autotest',
         'thash_perf_autotest',
+        'reassembly_perf_autotest',
 ]

 driver_test_names = [
diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
new file mode 100644
index 0000000000..24e37a1074
--- /dev/null
+++ b/app/test/test_reassembly_perf.c
@@ -0,0 +1,1002 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell.
+ */
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_hexdump.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_random.h>
+#include <rte_udp.h>
+
+#include "test.h"
+
+#define MAX_FLOWS	    (1024 * 32)
+#define MAX_BKTS	    MAX_FLOWS
+#define MAX_ENTRIES_PER_BKT 16
+#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define MIN_FRAGMENTS	    2
+#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
+
+#define MAX_PKT_LEN 2048
+#define MAX_TTL_MS  (5 * MS_PER_S)
+
+/* use RFC863 Discard Protocol */
+#define UDP_SRC_PORT 9
+#define UDP_DST_PORT 9
+
+/* use RFC5735 / RFC2544 reserved network test addresses */
+#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
+#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 15) | (x))
+
+/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
+static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#define IP6_VERSION 6
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+
+static struct rte_ip_frag_tbl *frag_tbl;
+static struct rte_mempool *pkt_pool;
+static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
+static uint8_t frag_per_flow[MAX_FLOWS];
+static uint32_t flow_cnt;
+
+#define FILL_MODE_LINEAR      0
+#define FILL_MODE_RANDOM      1
+#define FILL_MODE_INTERLEAVED 2
+
+static int
+reassembly_test_setup(void)
+{
+	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
+
+	frag_tbl = rte_ip_frag_table_create(MAX_BKTS, MAX_ENTRIES_PER_BKT,
+					    MAX_BKTS * MAX_ENTRIES_PER_BKT, max_ttl_cyc,
+					    rte_socket_id());
+	if (frag_tbl == NULL)
+		return TEST_FAILED;
+
+	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
+	pkt_pool = rte_pktmbuf_pool_create(
+		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS, 0, 0,
+		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+	if (pkt_pool == NULL) {
+		printf("[%s] Failed to create pkt pool\n", __func__);
+		rte_ip_frag_table_destroy(frag_tbl);
+		return TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static void
+reassembly_test_teardown(void)
+{
+	if (frag_tbl != NULL)
+		rte_ip_frag_table_destroy(frag_tbl);
+
+	if (pkt_pool != NULL)
+		rte_mempool_free(pkt_pool);
+}
+
+static void
+randomize_array_positions(void **array, uint8_t sz)
+{
+	void *tmp;
+	int i, j;
+
+	if (sz == 2) {
+		tmp = array[0];
+		array[0] = array[1];
+		array[1] = tmp;
+	} else {
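+		/* Classic Fisher-Yates shuffle for arrays longer than two entries. */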
+		for (i = sz - 1; i > 0; i--) {
+			j = rte_rand_max(i + 1);
+			tmp = array[i];
+			array[i] = array[j];
+			array[j] = tmp;
+		}
+	}
+}
+
+static void
+reassembly_print_banner(const char *proto_str)
+{
+	printf("+=============================================================="
+	       "============================================+\n");
+	printf("| %-32s| %-3s : %-58d|\n", proto_str, "Flow Count", MAX_FLOWS);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+	printf("%-17s%-17s%-14s%-14s%-25s%-20s\n", "| Fragment Order",
+	       "| Fragments/Flow", "| Outstanding", "| Cycles/Flow",
+	       "| Cycles/Fragment insert", "| Cycles/Reassembly |");
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint32_t ip_cksum;
+		uint16_t pkt_len;
+		uint16_t *ptr16;
+
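+		/* Offset field counts 8-byte units; MF set on all but the last fragment. */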
+		frag_offset = i * (frag_len / 8);
+
+		if (i == nb_frags - 1)
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+		else
+			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv4_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
+		ip_hdr->time_to_live = IP_DEFTTL;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id =
+			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
+		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
+		/* Using more than 32K flows will modify the 2nd octet of the IP. */
+		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
+		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));
+
+		/*
+		 * Compute IP header checksum.
+		 */
+		ptr16 = (unaligned_uint16_t *)ip_hdr;
+		ip_cksum = 0;
+		ip_cksum += ptr16[0];
+		ip_cksum += ptr16[1];
+		ip_cksum += ptr16[2];
+		ip_cksum += ptr16[3];
+		ip_cksum += ptr16[4];
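+		/* ptr16[5] is the checksum field itself and is skipped here. */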
+		ip_cksum += ptr16[6];
+		ip_cksum += ptr16[7];
+		ip_cksum += ptr16[8];
+		ip_cksum += ptr16[9];
+
+		/*
+		 * Reduce 32 bit checksum to 16 bits and complement it.
+		 */
+		ip_cksum = ((ip_cksum & 0xFFFF0000) >> 16) +
+			   (ip_cksum & 0x0000FFFF);
+		if (ip_cksum > 65535)
+			ip_cksum -= 65535;
+		ip_cksum = (~ip_cksum) & 0x0000FFFF;
+		if (ip_cksum == 0)
+			ip_cksum = 0xFFFF;
+		ip_hdr->hdr_checksum = (uint16_t)ip_cksum;
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len = sizeof(struct rte_ipv4_hdr);
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static uint8_t
+get_rand_frags(uint8_t max_frag)
+{
+	uint8_t frags = rte_rand_max(max_frag + 1);
+
+	return frags <= 1 ? MIN_FRAGMENTS : frags;
+}
+
+static int
+ipv4_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+ipv6_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct ipv6_extension_fragment *frag_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint16_t pkt_len;
+
+		frag_offset = i * (frag_len / 8);
+		frag_offset <<= 3;
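+		/* frag_data packs the byte offset with the M (more fragments) bit. */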
+		if (i == nb_frags - 1) {
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 0);
+		} else {
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 1);
+		}
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv6_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr) +
+				RTE_IPV6_FRAG_HDR_SIZE);
+		frag_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct ipv6_extension_fragment *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv6_hdr) +
+				     RTE_IPV6_FRAG_HDR_SIZE);
+		ip_hdr->vtc_flow = rte_cpu_to_be_32(IP6_VERSION << 28);
+		ip_hdr->payload_len =
+			rte_cpu_to_be_16(pkt_len - sizeof(struct rte_ipv6_hdr));
+		ip_hdr->proto = IPPROTO_FRAGMENT;
+		ip_hdr->hop_limits = IP_DEFTTL;
+		memcpy(ip_hdr->src_addr, ip6_addr, sizeof(ip_hdr->src_addr));
+		memcpy(ip_hdr->dst_addr, ip6_addr, sizeof(ip_hdr->dst_addr));
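+		/* Fold the flow id into address bytes 7-9 so every flow is unique. */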
+		ip_hdr->src_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->src_addr[7] |= 0x10;
+		ip_hdr->src_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->src_addr[9] = flow_id & 0xff;
+
+		ip_hdr->dst_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->dst_addr[7] |= 0x20;
+		ip_hdr->dst_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->dst_addr[9] = flow_id & 0xff;
+
+		frag_hdr->next_header = IPPROTO_UDP;
+		frag_hdr->reserved = 0;
+		frag_hdr->frag_data = rte_cpu_to_be_16(frag_offset);
+		frag_hdr->id = rte_cpu_to_be_32(flow_id + 1);
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len =
+			sizeof(struct rte_ipv6_hdr) + RTE_IPV6_FRAG_HDR_SIZE;
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static int
+ipv6_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+frag_pkt_teardown(void)
+{
+	uint32_t i;
+
+	for (i = 0; i < flow_cnt; i++)
+		rte_pktmbuf_free(mbufs[i][0]);
+}
+
+static void
+reassembly_print_stats(int8_t nb_frags, uint8_t fill_order,
+		       uint32_t outstanding, uint64_t cyc_per_flow,
+		       uint64_t cyc_per_frag_insert,
+		       uint64_t cyc_per_reassembly)
+{
+	char frag_str[8], order_str[12];
+
+	if (nb_frags > 0)
+		snprintf(frag_str, sizeof(frag_str), "%d", nb_frags);
+	else
+		snprintf(frag_str, sizeof(frag_str), "RANDOM");
+
+	switch (fill_order) {
+	case FILL_MODE_LINEAR:
+		snprintf(order_str, sizeof(order_str), "LINEAR");
+		break;
+	case FILL_MODE_RANDOM:
+		snprintf(order_str, sizeof(order_str), "RANDOM");
+		break;
+	case FILL_MODE_INTERLEAVED:
+		snprintf(order_str, sizeof(order_str), "INTERLEAVED");
+		break;
+	default:
+		break;
+	}
+
+	printf("| %-14s | %-14s | %-11d | %-11" PRIu64 " | %-22" PRIu64
+	       " | %-17" PRIu64 " |\n",
+	       order_str, frag_str, outstanding, cyc_per_flow,
+	       cyc_per_frag_insert, cyc_per_reassembly);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+join_array(struct rte_mbuf **dest_arr, struct rte_mbuf **src_arr,
+	   uint8_t offset, uint8_t sz)
+{
+	int i, j;
+
+	for (i = offset, j = 0; j < sz; i++, j++)
+		dest_arr[i] = src_arr[j];
+}
+
+static int
+ipv4_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert all but the first fragment of each flow so the entries stay outstanding. */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_interleaved_flows_perf(uint8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
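+		/* Merge four flows' fragments into one array and shuffle to interleave. */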
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert all but the first fragment of each flow so the entries stay outstanding. */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
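+		/* Merge four flows' fragments into one array and shuffle to interleave. */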
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/*Packet out*/
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv4_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv4_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv4_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv4_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv4_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+ipv6_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv6_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv6_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv6_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv6_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv6_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+test_reassembly_perf(void)
+{
+	int8_t nb_fragments[] = {2, 3, MAX_FRAGMENTS, -1 /* Random */};
+	uint8_t order_type[] = {FILL_MODE_LINEAR, FILL_MODE_RANDOM};
+	uint32_t outstanding[] = {100, 500, 1000, 2000, 3000};
+	uint32_t i, j;
+	int rc;
+
+	rc = reassembly_test_setup();
+	if (rc)
+		return rc;
+
+	reassembly_print_banner("IPV4");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv4_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv4_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	printf("\n");
+	reassembly_print_banner("IPV6");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv6_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv6_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	reassembly_test_teardown();
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(reassembly_perf_autotest, test_reassembly_perf);
--
2.25.1
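
Once applied, the new case can be invoked like the other perf autotests,
e.g. with DPDK_TEST=reassembly_perf_autotest passed to the dpdk-test binary
or by entering reassembly_perf_autotest at the RTE>> prompt (the binary
path depends on the build directory).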


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v5 1/2] ip_frag: optimize key compare and hash generation
  2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-06-02 17:01         ` [PATCH v5 2/2] test: add reassembly perf test pbhagavatula
@ 2023-06-05 11:09         ` Константин Ананьев
  2023-06-27  9:23         ` Konstantin Ananyev
  2023-07-11 16:52         ` [PATCH v6 " pbhagavatula
  3 siblings, 0 replies; 28+ messages in thread
From: Константин Ананьев @ 2023-06-05 11:09 UTC (permalink / raw)
  To: pbhagavatula, jerinj, Honnappa.Nagarahalli@arm.com, nd,
	Ruifeng Wang, Yipeng Wang, Sameh Gobriel, Bruce Richardson,
	Vladimir Medvedkin
  Cc: dev

[-- Attachment #1: Type: text/html, Size: 2335 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 2/2] test: add reassembly perf test
  2023-05-31  4:26       ` [PATCH v4 2/2] test: add reassembly perf test pbhagavatula
@ 2023-06-05 11:12         ` Константин Ананьев
  0 siblings, 0 replies; 28+ messages in thread
From: Константин Ананьев @ 2023-06-05 11:12 UTC (permalink / raw)
  To: pbhagavatula, jerinj; +Cc: dev, Amit Prakash Shukla

[-- Attachment #1: Type: text/html, Size: 2122 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v5 1/2] ip_frag: optimize key compare and hash generation
  2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
  2023-06-02 17:01         ` [PATCH v5 2/2] test: add reassembly perf test pbhagavatula
  2023-06-05 11:09         ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation Константин Ананьев
@ 2023-06-27  9:23         ` Konstantin Ananyev
  2023-07-11 16:52         ` [PATCH v6 " pbhagavatula
  3 siblings, 0 replies; 28+ messages in thread
From: Konstantin Ananyev @ 2023-06-27  9:23 UTC (permalink / raw)
  To: pbhagavatula
  Cc: Honnappa.Nagarahalli, bruce.richardson, dev, jerinj,
	konstantin.v.ananyev, nd, ruifeng.wang, sameh.gobriel,
	vladimir.medvedkin, yipeng1.wang


> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Use optimized rte_hash_k32_cmp_eq routine for key comparison for
> x86 and ARM64.
> Use CRC instructions for hash generation on ARM64.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
> On Neoverse-N2, performance improved by 10% when measured with
> examples/ip_reassembly.
> 
>  v5 Changes:
>  - Fix spellcheck.
>  v4 Changes:
>  - Fix compilation failures (sys/queue)
>  - Update test case to use proper macros.
>  v3 Changes:
>  - Drop NEON patch.
>  v2 Changes:
>  - Fix compilation failure with non-ARM64/x86 targets
> 
>  lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
>  lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
>  lib/ip_frag/ip_frag_common.h   | 14 ++++++++++++++
>  lib/ip_frag/ip_frag_internal.c |  4 ++--
>  4 files changed, 32 insertions(+), 18 deletions(-)
> 
> diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
> index e9e26f9abd..a3e85635eb 100644
> --- a/lib/hash/rte_cmp_arm64.h
> +++ b/lib/hash/rte_cmp_arm64.h
> @@ -3,7 +3,7 @@
>   */
> 
>  /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
> -static int
> +static inline int
>  rte_hash_k16_cmp_eq(const void *key1, const void *key2,
>  		    size_t key_len __rte_unused)
>  {
> @@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
>  	return !(x0 == 0 && x1 == 0);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
> @@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 16, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
> @@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 32, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
> @@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 32, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> @@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 64, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> @@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 64, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> @@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 96, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
> index 13a5836351..ddfbef462f 100644
> --- a/lib/hash/rte_cmp_x86.h
> +++ b/lib/hash/rte_cmp_x86.h
> @@ -5,7 +5,7 @@
>  #include <rte_vect.h>
> 
>  /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
> -static int
> +static inline int
>  rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
>  {
>  	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
> @@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
>  	return !_mm_test_all_zeros(x, x);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
> @@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 16, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
> @@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 32, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
> @@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 32, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> @@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 64, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> @@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 64, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> @@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  				(const char *) key2 + 96, key_len);
>  }
> 
> -static int
> +static inline int
>  rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
>  {
>  	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
> diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
> index 0d8ce6a1e1..7d6c1aa98d 100644
> --- a/lib/ip_frag/ip_frag_common.h
> +++ b/lib/ip_frag/ip_frag_common.h
> @@ -7,6 +7,14 @@
> 
>  #include <sys/queue.h>
> 
> +#include <rte_common.h>
> +
> +#if defined(RTE_ARCH_ARM64)
> +#include <rte_cmp_arm64.h>
> +#elif defined(RTE_ARCH_X86)
> +#include <rte_cmp_x86.h>
> +#endif
> +
>  #include "rte_ip_frag.h"
>  #include "ip_reassembly.h"
> 
> @@ -75,12 +83,18 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
>  static inline uint64_t
>  ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
>  {
> +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
> +	return (k1->id_key_len != k2->id_key_len) ||
> +	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
> +					     rte_hash_k32_cmp_eq(k1, k2, 32));
> +#else
>  	uint32_t i;
>  	uint64_t val;
>  	val = k1->id_key_len ^ k2->id_key_len;
>  	for (i = 0; i < k1->key_len; i++)
>  		val |= k1->src_dst[i] ^ k2->src_dst[i];
>  	return val;
> +#endif
>  }
> 
>  /*
> diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
> index b436a4c931..7cbef647df 100644
> --- a/lib/ip_frag/ip_frag_internal.c
> +++ b/lib/ip_frag/ip_frag_internal.c
> @@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)
> 
>  	p = (const uint32_t *)&key->src_dst;
> 
> -#ifdef RTE_ARCH_X86
> +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
>  	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
>  	v = rte_hash_crc_4byte(p[1], v);
>  	v = rte_hash_crc_4byte(key->id, v);
> @@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)
> 
>  	p = (const uint32_t *) &key->src_dst;
> 
> -#ifdef RTE_ARCH_X86
> +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
>  	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
>  	v = rte_hash_crc_4byte(p[1], v);
>  	v = rte_hash_crc_4byte(p[2], v);
> --

Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>

> 2.25.1
> 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v5 2/2] test: add reassembly perf test
  2023-06-02 17:01         ` [PATCH v5 2/2] test: add reassembly perf test pbhagavatula
@ 2023-06-27  9:36           ` Konstantin Ananyev
  0 siblings, 0 replies; 28+ messages in thread
From: Konstantin Ananyev @ 2023-06-27  9:36 UTC (permalink / raw)
  To: pbhagavatula; +Cc: Honnappa.Nagarahalli, amitprakashs, dev, jerinj, nd

> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Add reassembly perf autotest for both ipv4 and ipv6 reassembly.
> Each test is performed with a variable number of fragments per flow,
> with either ordered or unordered fragments, and with interleaved flows.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Reviewed-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> Tested-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> ---
>  app/test/meson.build            |    2 +
>  app/test/test_reassembly_perf.c | 1002 +++++++++++++++++++++++++++++++
>  2 files changed, 1004 insertions(+)
>  create mode 100644 app/test/test_reassembly_perf.c
> --

Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>

> 2.25.1



^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v6 1/2] ip_frag: optimize key compare and hash generation
  2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
                           ` (2 preceding siblings ...)
  2023-06-27  9:23         ` Konstantin Ananyev
@ 2023-07-11 16:52         ` pbhagavatula
  2023-07-11 16:52           ` [PATCH v6 2/2] test: add reassembly perf test pbhagavatula
  2023-07-12 14:59           ` [PATCH v6 1/2] ip_frag: optimize key compare and hash generation Thomas Monjalon
  3 siblings, 2 replies; 28+ messages in thread
From: pbhagavatula @ 2023-07-11 16:52 UTC (permalink / raw)
  To: jerinj, Ruifeng Wang, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use optimized rte_hash_k32_cmp_eq routine for key comparison for
x86 and ARM64.
Use CRC instructions for hash generation on ARM64.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
---
On Neoverse-N2, performance improved by 10% when measured with
examples/ip_reassembly.

 v6 Changes:
 - Fix compilation with mingw
 v5 Changes:
 - Fix spellcheck.
 v4 Changes:
 - Fix compilation failures (sys/queue)
 - Update test case to use proper macros.
 v3 Changes:
 - Drop NEON patch.
 v2 Changes:
 - Fix compilation failure with non-ARM64/x86 targets

 lib/hash/rte_cmp_arm64.h       | 16 ++++++++--------
 lib/hash/rte_cmp_x86.h         | 16 ++++++++--------
 lib/ip_frag/ip_frag_common.h   | 14 ++++++++++++++
 lib/ip_frag/ip_frag_internal.c |  4 ++--
 4 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/lib/hash/rte_cmp_arm64.h b/lib/hash/rte_cmp_arm64.h
index e9e26f9abd..a3e85635eb 100644
--- a/lib/hash/rte_cmp_arm64.h
+++ b/lib/hash/rte_cmp_arm64.h
@@ -3,7 +3,7 @@
  */

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 		    size_t key_len __rte_unused)
 {
@@ -24,7 +24,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2,
 	return !(x0 == 0 && x1 == 0);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -32,7 +32,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -42,7 +42,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -50,7 +50,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -58,7 +58,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -66,7 +66,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -76,7 +76,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/hash/rte_cmp_x86.h b/lib/hash/rte_cmp_x86.h
index 13a5836351..ddfbef462f 100644
--- a/lib/hash/rte_cmp_x86.h
+++ b/lib/hash/rte_cmp_x86.h
@@ -5,7 +5,7 @@
 #include <rte_vect.h>

 /* Functions to compare multiple of 16 byte keys (up to 128 bytes) */
-static int
+static inline int
 rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unused)
 {
 	const __m128i k1 = _mm_loadu_si128((const __m128i *) key1);
@@ -15,7 +15,7 @@ rte_hash_k16_cmp_eq(const void *key1, const void *key2, size_t key_len __rte_unu
 	return !_mm_test_all_zeros(x, x);
 }

-static int
+static inline int
 rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -23,7 +23,7 @@ rte_hash_k32_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 16, key_len);
 }

-static int
+static inline int
 rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k16_cmp_eq(key1, key2, key_len) ||
@@ -33,7 +33,7 @@ rte_hash_k48_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k32_cmp_eq(key1, key2, key_len) ||
@@ -41,7 +41,7 @@ rte_hash_k64_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 32, key_len);
 }

-static int
+static inline int
 rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -49,7 +49,7 @@ rte_hash_k80_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -57,7 +57,7 @@ rte_hash_k96_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 64, key_len);
 }

-static int
+static inline int
 rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
@@ -67,7 +67,7 @@ rte_hash_k112_cmp_eq(const void *key1, const void *key2, size_t key_len)
 				(const char *) key2 + 96, key_len);
 }

-static int
+static inline int
 rte_hash_k128_cmp_eq(const void *key1, const void *key2, size_t key_len)
 {
 	return rte_hash_k64_cmp_eq(key1, key2, key_len) ||
diff --git a/lib/ip_frag/ip_frag_common.h b/lib/ip_frag/ip_frag_common.h
index 0d8ce6a1e1..7d6c1aa98d 100644
--- a/lib/ip_frag/ip_frag_common.h
+++ b/lib/ip_frag/ip_frag_common.h
@@ -7,6 +7,14 @@

 #include <sys/queue.h>

+#include <rte_common.h>
+
+#if defined(RTE_ARCH_ARM64)
+#include <rte_cmp_arm64.h>
+#elif defined(RTE_ARCH_X86)
+#include <rte_cmp_x86.h>
+#endif
+
 #include "rte_ip_frag.h"
 #include "ip_reassembly.h"

@@ -75,12 +83,18 @@ ip_frag_key_invalidate(struct ip_frag_key * key)
 static inline uint64_t
 ip_frag_key_cmp(const struct ip_frag_key * k1, const struct ip_frag_key * k2)
 {
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
+	return (k1->id_key_len != k2->id_key_len) ||
+	       (k1->key_len == IPV4_KEYLEN ? k1->src_dst[0] != k2->src_dst[0] :
+					     rte_hash_k32_cmp_eq(k1, k2, 32));
+#else
 	uint32_t i;
 	uint64_t val;
 	val = k1->id_key_len ^ k2->id_key_len;
 	for (i = 0; i < k1->key_len; i++)
 		val |= k1->src_dst[i] ^ k2->src_dst[i];
 	return val;
+#endif
 }

 /*
diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index b436a4c931..7cbef647df 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -45,7 +45,7 @@ ipv4_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *)&key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(key->id, v);
@@ -66,7 +66,7 @@ ipv6_frag_hash(const struct ip_frag_key *key, uint32_t *v1, uint32_t *v2)

 	p = (const uint32_t *) &key->src_dst;

-#ifdef RTE_ARCH_X86
+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 	v = rte_hash_crc_4byte(p[0], PRIME_VALUE);
 	v = rte_hash_crc_4byte(p[1], v);
 	v = rte_hash_crc_4byte(p[2], v);
--
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v6 2/2] test: add reassembly perf test
  2023-07-11 16:52         ` [PATCH v6 " pbhagavatula
@ 2023-07-11 16:52           ` pbhagavatula
  2023-07-12 14:59           ` [PATCH v6 1/2] ip_frag: optimize key compare and hash generation Thomas Monjalon
  1 sibling, 0 replies; 28+ messages in thread
From: pbhagavatula @ 2023-07-11 16:52 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh, Amit Prakash Shukla, Konstantin Ananyev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add reassembly perf autotest for both ipv4 and ipv6 reassembly.
Each test is performed with a variable number of fragments per flow,
with either ordered or unordered fragments, and with interleaved flows.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Reviewed-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Tested-by: Amit Prakash Shukla <amitprakashs@marvell.com>
Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
---
 app/test/meson.build            |    2 +
 app/test/test_reassembly_perf.c | 1003 +++++++++++++++++++++++++++++++
 2 files changed, 1005 insertions(+)
 create mode 100644 app/test/test_reassembly_perf.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 3e0a2360a3..b89cf0368f 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -106,6 +106,7 @@ test_sources = files(
         'test_rawdev.c',
         'test_rcu_qsbr.c',
         'test_rcu_qsbr_perf.c',
+        'test_reassembly_perf.c',
         'test_reciprocal_division.c',
         'test_reciprocal_division_perf.c',
         'test_red.c',
@@ -296,6 +297,7 @@ perf_test_names = [
         'trace_perf_autotest',
         'ipsec_perf_autotest',
         'thash_perf_autotest',
+        'reassembly_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_reassembly_perf.c b/app/test/test_reassembly_perf.c
new file mode 100644
index 0000000000..c11b65291f
--- /dev/null
+++ b/app/test/test_reassembly_perf.c
@@ -0,0 +1,1003 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell.
+ */
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_hexdump.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_os_shim.h>
+#include <rte_random.h>
+#include <rte_udp.h>
+
+#include "test.h"
+
+#define MAX_FLOWS	    (1024 * 32)
+#define MAX_BKTS	    MAX_FLOWS
+#define MAX_ENTRIES_PER_BKT 16
+#define MAX_FRAGMENTS	    RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define MIN_FRAGMENTS	    2
+#define MAX_PKTS	    (MAX_FLOWS * MAX_FRAGMENTS)
+
+#define MAX_PKT_LEN 2048
+#define MAX_TTL_MS  (5 * MS_PER_S)
+
+/* use RFC863 Discard Protocol */
+#define UDP_SRC_PORT 9
+#define UDP_DST_PORT 9
+
+/* use RFC5735 / RFC2544 reserved network test addresses */
+#define IP_SRC_ADDR(x) ((198U << 24) | (18 << 16) | (0 << 8) | (x))
+#define IP_DST_ADDR(x) ((198U << 24) | (18 << 16) | (1 << 15) | (x))
+
+/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
+static uint8_t ip6_addr[16] = {32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#define IP6_VERSION 6
+
+#define IP_DEFTTL 64 /* from RFC 1340. */
+
+static struct rte_ip_frag_tbl *frag_tbl;
+static struct rte_mempool *pkt_pool;
+static struct rte_mbuf *mbufs[MAX_FLOWS][MAX_FRAGMENTS];
+static uint8_t frag_per_flow[MAX_FLOWS];
+static uint32_t flow_cnt;
+
+#define FILL_MODE_LINEAR      0
+#define FILL_MODE_RANDOM      1
+#define FILL_MODE_INTERLEAVED 2
+
+static int
+reassembly_test_setup(void)
+{
+	uint64_t max_ttl_cyc = (MAX_TTL_MS * rte_get_timer_hz()) / 1E3;
+
+	frag_tbl = rte_ip_frag_table_create(MAX_BKTS, MAX_ENTRIES_PER_BKT,
+					    MAX_BKTS * MAX_ENTRIES_PER_BKT, max_ttl_cyc,
+					    rte_socket_id());
+	if (frag_tbl == NULL)
+		return TEST_FAILED;
+
+	rte_mbuf_set_user_mempool_ops("ring_mp_mc");
+	pkt_pool = rte_pktmbuf_pool_create(
+		"reassembly_perf_pool", MAX_FLOWS * MAX_FRAGMENTS, 0, 0,
+		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+	if (pkt_pool == NULL) {
+		printf("[%s] Failed to create pkt pool\n", __func__);
+		rte_ip_frag_table_destroy(frag_tbl);
+		return TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+static void
+reassembly_test_teardown(void)
+{
+	if (frag_tbl != NULL)
+		rte_ip_frag_table_destroy(frag_tbl);
+
+	if (pkt_pool != NULL)
+		rte_mempool_free(pkt_pool);
+}
+
+static void
+randomize_array_positions(void **array, uint8_t sz)
+{
+	void *tmp;
+	int i, j;
+
+	if (sz == 2) {
+		tmp = array[0];
+		array[0] = array[1];
+		array[1] = tmp;
+	} else {
+		for (i = sz - 1; i > 0; i--) {
+			j = rte_rand_max(i + 1);
+			tmp = array[i];
+			array[i] = array[j];
+			array[j] = tmp;
+		}
+	}
+}
+
+static void
+reassembly_print_banner(const char *proto_str)
+{
+	printf("+=============================================================="
+	       "============================================+\n");
+	printf("| %-32s| %-3s : %-58d|\n", proto_str, "Flow Count", MAX_FLOWS);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+	printf("%-17s%-17s%-14s%-14s%-25s%-20s\n", "| Fragment Order",
+	       "| Fragments/Flow", "| Outstanding", "| Cycles/Flow",
+	       "| Cycles/Fragment insert", "| Cycles/Reassembly |");
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
+static void
+ipv4_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint32_t ip_cksum;
+		uint16_t pkt_len;
+		uint16_t *ptr16;
+
+		frag_offset = i * (frag_len / 8);
+
+		if (i == nb_frags - 1)
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+		else
+			frag_offset |= RTE_IPV4_HDR_MF_FLAG;
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv4_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv4_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv4_hdr));
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = rte_cpu_to_be_16(frag_offset);
+		ip_hdr->time_to_live = IP_DEFTTL;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id =
+			rte_cpu_to_be_16((flow_id + 1) % UINT16_MAX);
+		ip_hdr->total_length = rte_cpu_to_be_16(pkt_len);
+		/* Using more than 32K flows will modify the 2nd octet of the IP. */
+		ip_hdr->src_addr = rte_cpu_to_be_32(IP_SRC_ADDR(flow_id));
+		ip_hdr->dst_addr = rte_cpu_to_be_32(IP_DST_ADDR(flow_id));
+
+		/*
+		 * Compute IP header checksum.
+		 */
+		ptr16 = (unaligned_uint16_t *)ip_hdr;
+		ip_cksum = 0;
+		ip_cksum += ptr16[0];
+		ip_cksum += ptr16[1];
+		ip_cksum += ptr16[2];
+		ip_cksum += ptr16[3];
+		ip_cksum += ptr16[4];
+		ip_cksum += ptr16[6];
+		ip_cksum += ptr16[7];
+		ip_cksum += ptr16[8];
+		ip_cksum += ptr16[9];
+
+		/*
+		 * Reduce 32 bit checksum to 16 bits and complement it.
+		 */
+		ip_cksum = ((ip_cksum & 0xFFFF0000) >> 16) +
+			   (ip_cksum & 0x0000FFFF);
+		if (ip_cksum > 65535)
+			ip_cksum -= 65535;
+		ip_cksum = (~ip_cksum) & 0x0000FFFF;
+		if (ip_cksum == 0)
+			ip_cksum = 0xFFFF;
+		ip_hdr->hdr_checksum = (uint16_t)ip_cksum;
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len = sizeof(struct rte_ipv4_hdr);
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
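+/* Draw a fragment count in [0, max_frag]; draws of 0 or 1 are clamped
+ * up to MIN_FRAGMENTS so every flow is genuinely fragmented.
+ */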
+static uint8_t
+get_rand_frags(uint8_t max_frag)
+{
+	uint8_t frags = rte_rand_max(max_frag + 1);
+
+	return frags <= 1 ? MIN_FRAGMENTS : frags;
+}
+
+static int
+ipv4_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv4_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static void
+ipv6_frag_fill_data(struct rte_mbuf **mbuf, uint8_t nb_frags, uint32_t flow_id,
+		    uint8_t fill_mode)
+{
+	struct ipv6_extension_fragment *frag_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ip_hdr;
+	struct rte_udp_hdr *udp_hdr;
+	uint16_t frag_len;
+	uint8_t i;
+
+	frag_len = MAX_PKT_LEN / nb_frags;
+	if (frag_len % 8)
+		frag_len = RTE_ALIGN_MUL_CEIL(frag_len, 8);
+
+	for (i = 0; i < nb_frags; i++) {
+		struct rte_mbuf *frag = mbuf[i];
+		uint16_t frag_offset = 0;
+		uint16_t pkt_len;
+
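+		/* RTE_IPV6_SET_FRAG_DATA() takes the byte offset (hence the
+		 * shift back from 8-byte units) and the M (more fragments)
+		 * bit.
+		 */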
+		frag_offset = i * (frag_len / 8);
+		frag_offset <<= 3;
+		if (i == nb_frags - 1) {
+			frag_len = MAX_PKT_LEN - (frag_len * (nb_frags - 1));
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 0);
+		} else {
+			frag_offset = RTE_IPV6_SET_FRAG_DATA(frag_offset, 1);
+		}
+
+		rte_pktmbuf_reset_headroom(frag);
+		eth_hdr = rte_pktmbuf_mtod(frag, struct rte_ether_hdr *);
+		ip_hdr = rte_pktmbuf_mtod_offset(frag, struct rte_ipv6_hdr *,
+						 sizeof(struct rte_ether_hdr));
+		udp_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct rte_udp_hdr *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr) +
+				RTE_IPV6_FRAG_HDR_SIZE);
+		frag_hdr = rte_pktmbuf_mtod_offset(
+			frag, struct ipv6_extension_fragment *,
+			sizeof(struct rte_ether_hdr) +
+				sizeof(struct rte_ipv6_hdr));
+
+		rte_ether_unformat_addr("02:00:00:00:00:01",
+					&eth_hdr->dst_addr);
+		rte_ether_unformat_addr("02:00:00:00:00:00",
+					&eth_hdr->src_addr);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6);
+
+		pkt_len = frag_len;
+		/*
+		 * Initialize UDP header.
+		 */
+		if (i == 0) {
+			udp_hdr->src_port = rte_cpu_to_be_16(UDP_SRC_PORT);
+			udp_hdr->dst_port = rte_cpu_to_be_16(UDP_DST_PORT);
+			udp_hdr->dgram_len = rte_cpu_to_be_16(pkt_len);
+			udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
+		}
+
+		/*
+		 * Initialize IP header.
+		 */
+		pkt_len = (uint16_t)(pkt_len + sizeof(struct rte_ipv6_hdr) +
+				     RTE_IPV6_FRAG_HDR_SIZE);
+		ip_hdr->vtc_flow = rte_cpu_to_be_32(IP6_VERSION << 28);
+		ip_hdr->payload_len =
+			rte_cpu_to_be_16(pkt_len - sizeof(struct rte_ipv6_hdr));
+		ip_hdr->proto = IPPROTO_FRAGMENT;
+		ip_hdr->hop_limits = IP_DEFTTL;
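+		/* Encode the 20-bit flow ID into bytes 7-9 of both
+		 * addresses; bits 0x10/0x20 in byte 7 keep source and
+		 * destination distinct.
+		 */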
+		memcpy(ip_hdr->src_addr, ip6_addr, sizeof(ip_hdr->src_addr));
+		memcpy(ip_hdr->dst_addr, ip6_addr, sizeof(ip_hdr->dst_addr));
+		ip_hdr->src_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->src_addr[7] |= 0x10;
+		ip_hdr->src_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->src_addr[9] = flow_id & 0xff;
+
+		ip_hdr->dst_addr[7] = (flow_id >> 16) & 0xf;
+		ip_hdr->dst_addr[7] |= 0x20;
+		ip_hdr->dst_addr[8] = (flow_id >> 8) & 0xff;
+		ip_hdr->dst_addr[9] = flow_id & 0xff;
+
+		frag_hdr->next_header = IPPROTO_UDP;
+		frag_hdr->reserved = 0;
+		frag_hdr->frag_data = rte_cpu_to_be_16(frag_offset);
+		frag_hdr->id = rte_cpu_to_be_32(flow_id + 1);
+
+		frag->data_len = sizeof(struct rte_ether_hdr) + pkt_len;
+		frag->pkt_len = frag->data_len;
+		frag->l2_len = sizeof(struct rte_ether_hdr);
+		frag->l3_len =
+			sizeof(struct rte_ipv6_hdr) + RTE_IPV6_FRAG_HDR_SIZE;
+	}
+
+	if (fill_mode == FILL_MODE_RANDOM)
+		randomize_array_positions((void **)mbuf, nb_frags);
+}
+
+static int
+ipv6_rand_frag_pkt_setup(uint8_t fill_mode, uint8_t max_frag)
+{
+	uint8_t nb_frag;
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		nb_frag = get_rand_frags(max_frag);
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_frag_pkt_setup(uint8_t fill_mode, uint8_t nb_frag)
+{
+	int i;
+
+	for (i = 0; i < MAX_FLOWS; i++) {
+		if (rte_mempool_get_bulk(pkt_pool, (void **)mbufs[i], nb_frag) <
+		    0)
+			return TEST_FAILED;
+		ipv6_frag_fill_data(mbufs[i], nb_frag, i, fill_mode);
+		frag_per_flow[i] = nb_frag;
+	}
+	flow_cnt = i;
+
+	return TEST_SUCCESS;
+}
+
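+/* Freeing the head mbuf of each flow also releases any chained
+ * fragment segments left after reassembly.
+ */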
+static void
+frag_pkt_teardown(void)
+{
+	uint32_t i;
+
+	for (i = 0; i < flow_cnt; i++)
+		rte_pktmbuf_free(mbufs[i][0]);
+}
+
+static void
+reassembly_print_stats(int8_t nb_frags, uint8_t fill_order,
+		       uint32_t outstanding, uint64_t cyc_per_flow,
+		       uint64_t cyc_per_frag_insert,
+		       uint64_t cyc_per_reassembly)
+{
+	char frag_str[8], order_str[12];
+
+	if (nb_frags > 0)
+		snprintf(frag_str, sizeof(frag_str), "%d", nb_frags);
+	else
+		snprintf(frag_str, sizeof(frag_str), "RANDOM");
+
+	switch (fill_order) {
+	case FILL_MODE_LINEAR:
+		snprintf(order_str, sizeof(order_str), "LINEAR");
+		break;
+	case FILL_MODE_RANDOM:
+		snprintf(order_str, sizeof(order_str), "RANDOM");
+		break;
+	case FILL_MODE_INTERLEAVED:
+		snprintf(order_str, sizeof(order_str), "INTERLEAVED");
+		break;
+	default:
+		/* Avoid printing an uninitialized buffer. */
+		snprintf(order_str, sizeof(order_str), "UNKNOWN");
+		break;
+	}
+
+	printf("| %-14s | %-14s | %-11d | %-11" PRIu64 " | %-22" PRIu64
+	       " | %-17" PRIu64 " |\n",
+	       order_str, frag_str, outstanding, cyc_per_flow,
+	       cyc_per_frag_insert, cyc_per_reassembly);
+	printf("+================+================+=============+=============+"
+	       "========================+===================+\n");
+}
+
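+/* Append sz mbuf pointers from src_arr into dest_arr starting at offset. */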
+static void
+join_array(struct rte_mbuf **dest_arr, struct rte_mbuf **src_arr,
+	   uint8_t offset, uint8_t sz)
+{
+	int i, j;
+
+	for (i = offset, j = 0; j < sz; i++, j++)
+		dest_arr[i] = src_arr[j];
+}
+
+static int
+ipv4_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/* Packet out */
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
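+	/* Reassemble all flows. Until a flow's first fragment arrives
+	 * here, the fragments inserted above stay in the table as
+	 * outstanding entries, stressing the lookup path.
+	 */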
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/* Packet out */
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
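+	/* Process flows in batches of four: their fragments are merged
+	 * into one array and shuffled so consecutive insertions alternate
+	 * between flows.
+	 */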
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv4_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv4_hdr *, buf->l2_len);
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv4_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/* Packet out */
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_perf(int8_t nb_frags, uint8_t fill_order)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/* Packet out */
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled || buf_out->nb_segs != frag_per_flow[i])
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, 0, total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_outstanding_reassembly_perf(int8_t nb_frags, uint8_t fill_order,
+				 uint32_t outstanding)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j, k;
+
+	k = outstanding;
+	/* Insert outstanding fragments */
+	for (i = 0; k && (i < flow_cnt); i++) {
+		struct rte_mbuf *buf_out = NULL;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = frag_per_flow[i] - 1; j > 0; j--) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+			total_empty_cyc += rte_rdtsc_precise() - tstamp;
+			frag_processed++;
+
+			if (buf_out != NULL)
+				return TEST_FAILED;
+
+			k--;
+		}
+		frag_per_flow[i] = 1;
+	}
+
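+	/* Reassemble all flows. Until a flow's first fragment arrives
+	 * here, the fragments inserted above stay in the table as
+	 * outstanding entries, stressing the lookup path.
+	 */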
+	for (i = 0; i < flow_cnt; i++) {
+		struct rte_mbuf *buf_out = NULL;
+		uint8_t reassembled = 0;
+
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < frag_per_flow[i]; j++) {
+			struct rte_mbuf *buf = mbufs[i][j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/* Packet out */
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled = 1;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (!reassembled)
+			return TEST_FAILED;
+		memset(mbufs[i], 0, sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+		mbufs[i][0] = buf_out;
+	}
+
+	reassembly_print_stats(nb_frags, fill_order, outstanding,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv6_reassembly_interleaved_flows_perf(int8_t nb_frags)
+{
+	struct rte_ip_frag_death_row death_row;
+	uint64_t total_reassembled_cyc = 0;
+	uint64_t total_empty_cyc = 0;
+	uint64_t tstamp, flow_tstamp;
+	uint64_t frag_processed = 0;
+	uint64_t total_cyc = 0;
+	uint32_t i, j;
+
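+	/* Process flows in batches of four: their fragments are merged
+	 * into one array and shuffled so consecutive insertions alternate
+	 * between flows.
+	 */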
+	for (i = 0; i < flow_cnt; i += 4) {
+		struct rte_mbuf *buf_out[4] = {NULL};
+		uint8_t reassembled = 0;
+		uint8_t nb_frags = 0;
+		uint8_t prev = 0;
+
+		for (j = 0; j < 4; j++)
+			nb_frags += frag_per_flow[i + j];
+
+		struct rte_mbuf *buf_arr[nb_frags];
+		for (j = 0; j < 4; j++) {
+			join_array(buf_arr, mbufs[i + j], prev,
+				   frag_per_flow[i + j]);
+			prev += frag_per_flow[i + j];
+		}
+		randomize_array_positions((void **)buf_arr, nb_frags);
+		flow_tstamp = rte_rdtsc_precise();
+		for (j = 0; j < nb_frags; j++) {
+			struct rte_mbuf *buf = buf_arr[j];
+			struct rte_ipv6_hdr *ip_hdr = rte_pktmbuf_mtod_offset(
+				buf, struct rte_ipv6_hdr *, buf->l2_len);
+			struct ipv6_extension_fragment *frag_hdr =
+				rte_pktmbuf_mtod_offset(
+					buf, struct ipv6_extension_fragment *,
+					buf->l2_len +
+						sizeof(struct rte_ipv6_hdr));
+
+			tstamp = rte_rdtsc_precise();
+			buf_out[reassembled] = rte_ipv6_frag_reassemble_packet(
+				frag_tbl, &death_row, buf, flow_tstamp, ip_hdr,
+				frag_hdr);
+
+			if (buf_out[reassembled] == NULL) {
+				total_empty_cyc += rte_rdtsc_precise() - tstamp;
+				frag_processed++;
+				continue;
+			} else {
+				/* Packet out */
+				total_reassembled_cyc +=
+					rte_rdtsc_precise() - tstamp;
+				reassembled++;
+			}
+		}
+		total_cyc += rte_rdtsc_precise() - flow_tstamp;
+		if (reassembled != 4)
+			return TEST_FAILED;
+		for (j = 0; j < 4; j++) {
+			memset(mbufs[i + j], 0,
+			       sizeof(struct rte_mbuf *) * MAX_FRAGMENTS);
+			mbufs[i + j][0] = buf_out[j];
+		}
+	}
+
+	reassembly_print_stats(nb_frags, FILL_MODE_INTERLEAVED, 0,
+			       total_cyc / flow_cnt,
+			       total_empty_cyc / frag_processed,
+			       total_reassembled_cyc / flow_cnt);
+
+	return TEST_SUCCESS;
+}
+
+static int
+ipv4_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv4_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv4_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv4_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv4_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv4_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+ipv6_reassembly_test(int8_t nb_frags, uint8_t fill_order, uint32_t outstanding)
+{
+	int rc;
+
+	if (nb_frags > 0)
+		rc = ipv6_frag_pkt_setup(fill_order, nb_frags);
+	else
+		rc = ipv6_rand_frag_pkt_setup(fill_order, MAX_FRAGMENTS);
+
+	if (rc)
+		return rc;
+
+	if (outstanding)
+		rc = ipv6_outstanding_reassembly_perf(nb_frags, fill_order,
+						      outstanding);
+	else if (fill_order == FILL_MODE_INTERLEAVED)
+		rc = ipv6_reassembly_interleaved_flows_perf(nb_frags);
+	else
+		rc = ipv6_reassembly_perf(nb_frags, fill_order);
+
+	frag_pkt_teardown();
+
+	return rc;
+}
+
+static int
+test_reassembly_perf(void)
+{
+	int8_t nb_fragments[] = {2, 3, MAX_FRAGMENTS, -1 /* Random */};
+	uint8_t order_type[] = {FILL_MODE_LINEAR, FILL_MODE_RANDOM};
+	uint32_t outstanding[] = {100, 500, 1000, 2000, 3000};
+	uint32_t i, j;
+	int rc;
+
+	rc = reassembly_test_setup();
+	if (rc)
+		return rc;
+
+	reassembly_print_banner("IPV4");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv4_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv4_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv4_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	printf("\n");
+	reassembly_print_banner("IPV6");
+	/* Test variable fragment count and ordering. */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		for (j = 0; j < RTE_DIM(order_type); j++) {
+			rc = ipv6_reassembly_test(nb_fragments[i],
+						  order_type[j], 0);
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Test outstanding fragments in the table. */
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(2, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	for (i = 0; i < RTE_DIM(outstanding); i++) {
+		rc = ipv6_reassembly_test(MAX_FRAGMENTS, 0, outstanding[i]);
+		if (rc)
+			return rc;
+	}
+
+	/* Test interleaved flow reassembly perf */
+	for (i = 0; i < RTE_DIM(nb_fragments); i++) {
+		rc = ipv6_reassembly_test(nb_fragments[i],
+					  FILL_MODE_INTERLEAVED, 0);
+		if (rc)
+			return rc;
+	}
+	reassembly_test_teardown();
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(reassembly_perf_autotest, test_reassembly_perf);
-- 
2.25.1



* Re: [PATCH v6 1/2] ip_frag: optimize key compare and hash generation
  2023-07-11 16:52         ` [PATCH v6 " pbhagavatula
  2023-07-11 16:52           ` [PATCH v6 2/2] test: add reassembly perf test pbhagavatula
@ 2023-07-12 14:59           ` Thomas Monjalon
  1 sibling, 0 replies; 28+ messages in thread
From: Thomas Monjalon @ 2023-07-12 14:59 UTC (permalink / raw)
  To: Konstantin Ananyev, Pavan Nikhilesh
  Cc: jerinj, Ruifeng Wang, Yipeng Wang, Sameh Gobriel,
	Bruce Richardson, Vladimir Medvedkin, dev

11/07/2023 18:52, pbhagavatula@marvell.com:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Use optimized rte_hash_k32_cmp_eq routine for key comparison for
> x86 and ARM64.
> Use CRC instructions for hash generation on ARM64.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>

Series applied, thanks.





end of thread, other threads:[~2023-07-12 14:59 UTC | newest]

Thread overview: 28+ messages
2023-05-23 12:54 [PATCH 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
2023-05-23 12:54 ` [PATCH 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
2023-05-23 12:54 ` [PATCH 3/3] test: add reassembly perf test pbhagavatula
2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
2023-05-23 16:22     ` Honnappa Nagarahalli
2023-05-23 17:58       ` Pavan Nikhilesh Bhagavatula
2023-05-23 22:23         ` Pavan Nikhilesh Bhagavatula
2023-05-23 22:30     ` Stephen Hemminger
2023-05-29 13:17       ` [EXT] " Pavan Nikhilesh Bhagavatula
2023-05-23 14:39   ` [PATCH v2 3/3] test: add reassembly perf test pbhagavatula
2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
2023-05-29 14:55     ` [PATCH v3 2/2] test: add reassembly perf test pbhagavatula
2023-05-30 10:51       ` [EXT] " Amit Prakash Shukla
2023-05-30  3:09     ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation Stephen Hemminger
2023-05-30 17:50       ` [EXT] " Pavan Nikhilesh Bhagavatula
2023-05-30  7:44     ` Ruifeng Wang
2023-05-31  4:26     ` [PATCH v4 " pbhagavatula
2023-05-31  4:26       ` [PATCH v4 2/2] test: add reassembly perf test pbhagavatula
2023-06-05 11:12         ` Константин Ананьев
2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
2023-06-02 17:01         ` [PATCH v5 2/2] test: add reassembly perf test pbhagavatula
2023-06-27  9:36           ` Konstantin Ananyev
2023-06-05 11:09         ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation Константин Ананьев
2023-06-27  9:23         ` Konstantin Ananyev
2023-07-11 16:52         ` [PATCH v6 " pbhagavatula
2023-07-11 16:52           ` [PATCH v6 2/2] test: add reassembly perf test pbhagavatula
2023-07-12 14:59           ` [PATCH v6 1/2] ip_frag: optimize key compare and hash generation Thomas Monjalon
