DPDK patches and discussions
 help / color / mirror / Atom feed
From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
To: "pbhagavatula@marvell.com" <pbhagavatula@marvell.com>,
	"jerinj@marvell.com" <jerinj@marvell.com>, nd <nd@arm.com>,
	Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
Cc: "dev@dpdk.org" <dev@dpdk.org>, nd <nd@arm.com>, nd <nd@arm.com>
Subject: RE: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
Date: Tue, 23 May 2023 16:22:55 +0000	[thread overview]
Message-ID: <DBAPR08MB5814890A343C28626EF9A86D98409@DBAPR08MB5814.eurprd08.prod.outlook.com> (raw)
In-Reply-To: <20230523143921.7420-2-pbhagavatula@marvell.com>



> -----Original Message-----
> From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> Sent: Tuesday, May 23, 2023 9:39 AM
> To: jerinj@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin Ananyev
> <konstantin.v.ananyev@yandex.ru>
> Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
> 
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Improve reassembly lookup performance by using NEON intrinsics for key
> validation.
What improvement do you see with this?

> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
>  lib/ip_frag/ip_reassembly.h      |   6 +
>  lib/ip_frag/rte_ip_frag_common.c |  10 ++
>  3 files changed, 196 insertions(+), 44 deletions(-)
> 
> diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c index
> 7cbef647df..de78a0ed8f 100644
> --- a/lib/ip_frag/ip_frag_internal.c
> +++ b/lib/ip_frag/ip_frag_internal.c
> @@ -4,8 +4,9 @@
> 
>  #include <stddef.h>
> 
> -#include <rte_jhash.h>
>  #include <rte_hash_crc.h>
> +#include <rte_jhash.h>
> +#include <rte_vect.h>
> 
>  #include "ip_frag_common.h"
> 
> @@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct
> rte_ip_frag_death_row *dr,
>  	return pkt;
>  }
> 
> -struct ip_frag_pkt *
> -ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> -	const struct ip_frag_key *key, uint64_t tms,
> -	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> +static inline void
> +ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
> +	    uint32_t list_idx, uint32_t list_cnt) {
> +	RTE_SET_USED(tbl);
> +	RTE_SET_USED(list_idx);
> +	RTE_SET_USED(list_cnt);
> +	if (p->key.key_len == IPV4_KEYLEN)
> +		IP_FRAG_LOG(DEBUG,
> +			    "%s:%d:\n"
> +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> +			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
> +			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> +			    __func__, __LINE__, tbl, tbl->max_entries,
> +			    tbl->use_entries, p, list_idx, list_cnt,
> +			    p->key.src_dst[0], p->key.id, p->start);
> +	else
> +		IP_FRAG_LOG(DEBUG,
> +			    "%s:%d:\n"
> +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> +			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
> +			    "key: <" IPv6_KEY_BYTES_FMT
> +			    ", %#x>, start: %" PRIu64 "\n",
> +			    __func__, __LINE__, tbl, tbl->max_entries,
> +			    tbl->use_entries, p, list_idx, list_cnt,
> +			    IPv6_KEY_BYTES(p1[i].key.src_dst), p->key.id,
> +			    p->start);
> +}
> +
> +#if defined(RTE_ARCH_ARM64)
> +static inline struct ip_frag_pkt *
> +ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key
> *key, uint64_t tms,
> +		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> +	struct ip_frag_pkt *empty, *old;
> +	struct ip_frag_pkt *p1, *p2;
> +	uint32_t assoc, sig1, sig2;
> +	uint64_t max_cycles;
> +
> +	empty = NULL;
> +	old = NULL;
> +
> +	max_cycles = tbl->max_cycles;
> +	assoc = tbl->bucket_entries;
> +
> +	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
> +		return tbl->last;
> +
> +	/* different hashing methods for IPv4 and IPv6 */
> +	if (key->key_len == IPV4_KEYLEN)
> +		ipv4_frag_hash(key, &sig1, &sig2);
> +	else
> +		ipv6_frag_hash(key, &sig1, &sig2);
> +
> +	p1 = IP_FRAG_TBL_POS(tbl, sig1);
> +	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> +
> +	uint64x2_t key0, key1, key2, key3;
> +	uint64_t vmask, zmask, ts_mask;
> +	uint64x2_t ts0, ts1;
> +	uint32x4_t nz_key;
> +	uint8_t idx;
> +	/* Bucket entries are always power of 2. */
> +	rte_prefetch0(&p1[0].key);
> +	rte_prefetch0(&p1[1].key);
> +	rte_prefetch0(&p2[0].key);
> +	rte_prefetch0(&p2[1].key);
> +
> +	while (assoc > 1) {
> +		if (assoc > 2) {
> +			rte_prefetch0(&p1[2].key);
> +			rte_prefetch0(&p1[3].key);
> +			rte_prefetch0(&p2[2].key);
> +			rte_prefetch0(&p2[3].key);
> +		}
> +		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
> +		key0 = vld1q_u64(&p[0]->key.id_key_len);
> +		key1 = vld1q_u64(&p[1]->key.id_key_len);
> +		key2 = vld1q_u64(&p[2]->key.id_key_len);
> +		key3 = vld1q_u64(&p[3]->key.id_key_len);
> +
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3),
> +1), nz_key, 3);
> +
> +		nz_key = vceqzq_u32(nz_key);
> +		zmask =
> vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
> +		vmask = ~zmask;
> +
> +		vmask &= 0x8000800080008000;
> +		for (; vmask > 0; vmask &= vmask - 1) {
> +			idx = __builtin_ctzll(vmask) >> 4;
> +			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
> +				return p[idx];
> +		}
> +
> +		vmask = ~zmask;
> +		if (zmask && empty == NULL) {
> +			zmask &= 0x8000800080008000;
> +			idx = __builtin_ctzll(zmask) >> 4;
> +			empty = p[idx];
> +		}
> +
> +		if (vmask && old == NULL) {
> +			const uint64x2_t max_cyc =
> vdupq_n_u64(max_cycles);
> +			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
> +
> +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0,
> 0);
> +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0,
> 1);
> +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1,
> 0);
> +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1,
> 1);
> +
> +			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
> +			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
> +
> +			ts_mask =
> vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
> +
> 	vuzp1q_u32(vreinterpretq_u32_u64(ts0),
> +
> vreinterpretq_u32_u64(ts1)),
> +							16)),
> +						0);
> +			vmask &= 0x8000800080008000;
> +			ts_mask &= vmask;
> +			if (ts_mask) {
> +				idx = __builtin_ctzll(ts_mask) >> 4;
> +				old = p[idx];
> +			}
> +		}
> +		p1 += 2;
> +		p2 += 2;
> +		assoc -= 4;
> +	}
> +	while (assoc) {
> +		if (ip_frag_key_cmp(key, &p1->key) == 0)
> +			return p1;
> +		else if (ip_frag_key_is_empty(&p1->key))
> +			empty = (empty == NULL) ? p1 : empty;
> +		else if (max_cycles + p1->start < tms)
> +			old = (old == NULL) ? p1 : old;
> +
> +		if (ip_frag_key_cmp(key, &p2->key) == 0)
> +			return p2;
> +		else if (ip_frag_key_is_empty(&p2->key))
> +			empty = (empty == NULL) ? p2 : empty;
> +		else if (max_cycles + p2->start < tms)
> +			old = (old == NULL) ? p2 : old;
> +		p1++;
> +		p2++;
> +		assoc--;
> +	}
> +
> +	*free = empty;
> +	*stale = old;
> +	return NULL;
> +}
> +#endif
> +
> +static struct ip_frag_pkt *
> +ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key
> *key, uint64_t tms,
> +		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
>  {
>  	struct ip_frag_pkt *p1, *p2;
>  	struct ip_frag_pkt *empty, *old;
> @@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
>  	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> 
>  	for (i = 0; i != assoc; i++) {
> -		if (p1->key.key_len == IPV4_KEYLEN)
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv4_frag_pkt line0: %p, index: %u
> from %u\n"
> -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p1, i, assoc,
> -			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
> -		else
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv6_frag_pkt line0: %p, index: %u
> from %u\n"
> -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p1, i, assoc,
> -			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id,
> p1[i].start);
> -
> +		ip_frag_dbg(tbl, &p1[i], i, assoc);
>  		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
>  			return p1 + i;
>  		else if (ip_frag_key_is_empty(&p1[i].key))
> @@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
>  		else if (max_cycles + p1[i].start < tms)
>  			old = (old == NULL) ? (p1 + i) : old;
> 
> -		if (p2->key.key_len == IPV4_KEYLEN)
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv4_frag_pkt line1: %p, index: %u
> from %u\n"
> -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p2, i, assoc,
> -			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
> -		else
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv6_frag_pkt line1: %p, index: %u
> from %u\n"
> -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p2, i, assoc,
> -			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id,
> p2[i].start);
> -
> +		ip_frag_dbg(tbl, &p2[i], i, assoc);
>  		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
>  			return p2 + i;
>  		else if (ip_frag_key_is_empty(&p2[i].key))
> -			empty = (empty == NULL) ?( p2 + i) : empty;
> +			empty = (empty == NULL) ? (p2 + i) : empty;
>  		else if (max_cycles + p2[i].start < tms)
>  			old = (old == NULL) ? (p2 + i) : old;
>  	}
> @@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
>  	*stale = old;
>  	return NULL;
>  }
> +
> +struct ip_frag_pkt *
> +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key,
> uint64_t tms,
> +	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> +	switch (tbl->lookup_fn) {
> +#if defined(RTE_ARCH_ARM64)
> +	case REASSEMBLY_LOOKUP_NEON:
> +		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
> +#endif
> +	case REASSEMBLY_LOOKUP_SCALAR:
> +	default:
> +		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
> +	}
> +}
> diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h index
> ef9d8c0d75..049437ae32 100644
> --- a/lib/ip_frag/ip_reassembly.h
> +++ b/lib/ip_frag/ip_reassembly.h
> @@ -12,6 +12,11 @@
> 
>  #include <rte_ip_frag.h>
> 
> +enum ip_frag_lookup_func {
> +	REASSEMBLY_LOOKUP_SCALAR = 0,
> +	REASSEMBLY_LOOKUP_NEON,
> +};
> +
>  enum {
>  	IP_LAST_FRAG_IDX,    /* index of last fragment */
>  	IP_FIRST_FRAG_IDX,   /* index of first fragment */
> @@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
>  	struct ip_frag_pkt *last;     /* last used entry. */
>  	struct ip_pkt_list lru;       /* LRU list for table entries. */
>  	struct ip_frag_tbl_stat stat; /* statistics counters. */
> +	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function.
> */
>  	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
>  };
> 
> diff --git a/lib/ip_frag/rte_ip_frag_common.c
> b/lib/ip_frag/rte_ip_frag_common.c
> index c1de2e81b6..ef3c104e45 100644
> --- a/lib/ip_frag/rte_ip_frag_common.c
> +++ b/lib/ip_frag/rte_ip_frag_common.c
> @@ -5,7 +5,9 @@
>  #include <stddef.h>
>  #include <stdio.h>
> 
> +#include <rte_cpuflags.h>
>  #include <rte_log.h>
> +#include <rte_vect.h>
> 
>  #include "ip_frag_common.h"
> 
> @@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num,
> uint32_t bucket_entries,
>  	tbl->bucket_entries = bucket_entries;
>  	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
> 
> +#if defined(RTE_ARCH_ARM64)
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
> +	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
> +		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
> +	else
> +#endif
> +		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
> +
>  	TAILQ_INIT(&(tbl->lru));
>  	return tbl;
>  }
> --
> 2.25.1


  reply	other threads:[~2023-05-23 16:23 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-23 12:54 [PATCH 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
2023-05-23 12:54 ` [PATCH 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
2023-05-23 12:54 ` [PATCH 3/3] test: add reassembly perf test pbhagavatula
2023-05-23 14:39 ` [PATCH v2 1/3] ip_frag: optimize key compare and hash generation pbhagavatula
2023-05-23 14:39   ` [PATCH v2 2/3] ip_frag: improve reassembly lookup performance pbhagavatula
2023-05-23 16:22     ` Honnappa Nagarahalli [this message]
2023-05-23 17:58       ` Pavan Nikhilesh Bhagavatula
2023-05-23 22:23         ` Pavan Nikhilesh Bhagavatula
2023-05-23 22:30     ` Stephen Hemminger
2023-05-29 13:17       ` [EXT] " Pavan Nikhilesh Bhagavatula
2023-05-23 14:39   ` [PATCH v2 3/3] test: add reassembly perf test pbhagavatula
2023-05-29 14:55   ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
2023-05-29 14:55     ` [PATCH v3 2/2] test: add reassembly perf test pbhagavatula
2023-05-30 10:51       ` [EXT] " Amit Prakash Shukla
2023-05-30  3:09     ` [PATCH v3 1/2] ip_frag: optimize key compare and hash generation Stephen Hemminger
2023-05-30 17:50       ` [EXT] " Pavan Nikhilesh Bhagavatula
2023-05-30  7:44     ` Ruifeng Wang
2023-05-31  4:26     ` [PATCH v4 " pbhagavatula
2023-05-31  4:26       ` [PATCH v4 2/2] test: add reassembly perf test pbhagavatula
2023-06-05 11:12         ` Константин Ананьев
2023-06-02 17:01       ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation pbhagavatula
2023-06-02 17:01         ` [PATCH v5 2/2] test: add reassembly perf test pbhagavatula
2023-06-27  9:36           ` Konstantin Ananyev
2023-06-05 11:09         ` [PATCH v5 1/2] ip_frag: optimize key compare and hash generation Константин Ананьев
2023-06-27  9:23         ` Konstantin Ananyev
2023-07-11 16:52         ` [PATCH v6 " pbhagavatula
2023-07-11 16:52           ` [PATCH v6 2/2] test: add reassembly perf test pbhagavatula
2023-07-12 14:59           ` [PATCH v6 1/2] ip_frag: optimize key compare and hash generation Thomas Monjalon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=DBAPR08MB5814890A343C28626EF9A86D98409@DBAPR08MB5814.eurprd08.prod.outlook.com \
    --to=honnappa.nagarahalli@arm.com \
    --cc=dev@dpdk.org \
    --cc=jerinj@marvell.com \
    --cc=konstantin.v.ananyev@yandex.ru \
    --cc=nd@arm.com \
    --cc=pbhagavatula@marvell.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).