From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id A359042B81; Tue, 23 May 2023 15:03:02 +0200 (CEST) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 90C6F40A80; Tue, 23 May 2023 15:03:02 +0200 (CEST) Received: from mx0b-0016f401.pphosted.com (mx0b-0016f401.pphosted.com [67.231.156.173]) by mails.dpdk.org (Postfix) with ESMTP id 53D2640689 for ; Tue, 23 May 2023 15:03:00 +0200 (CEST) Received: from pps.filterd (m0045851.ppops.net [127.0.0.1]) by mx0b-0016f401.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id 34NBu1HW009659; Tue, 23 May 2023 06:02:48 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=marvell.com; h=from : to : cc : subject : date : message-id : in-reply-to : references : mime-version : content-transfer-encoding : content-type; s=pfpt0220; bh=8vP5Mr6b3I+3N3QZO9aIMK/x3u3HQuBm4u7U2GsVu+o=; b=jPHMUtf32uLoZV2m9tysw/I+aYx65TLjJMjvig9JdBEObzdFx8Zko0zhWxGOT3S6gYdi 9/RINnY25K43/i4FJPJLQVBLYC5BNMmtd3A29TrwvR6hXi4unSX8L+TfkIukhgA8BAar 4nihrOZQF54C9AwbxHXqtW0cjRDAkZFsHM2cEoUk9wwJCOw0+z5p80zMOOrSYvvIlKO+ FuP0P8VgvjxEBUZnIISh2SQIWO7dThwQ0NM/WNz778ST+J9horY31TMGKtrzcTIwXlaZ PgfLrO/Xonjgq7MT2ut4402CnR2/izq5Spgxis+geO3CbKLskrrnXoQPMNziR5BlM1Kr 5A== Received: from dc5-exch02.marvell.com ([199.233.59.182]) by mx0b-0016f401.pphosted.com (PPS) with ESMTPS id 3qpwqk1eay-3 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384 bits=256 verify=NOT); Tue, 23 May 2023 06:02:03 -0700 Received: from DC5-EXCH02.marvell.com (10.69.176.39) by DC5-EXCH02.marvell.com (10.69.176.39) with Microsoft SMTP Server (TLS) id 15.0.1497.48; Tue, 23 May 2023 06:01:50 -0700 Received: from maili.marvell.com (10.69.176.80) by DC5-EXCH02.marvell.com (10.69.176.39) with Microsoft SMTP Server id 15.0.1497.48 via Frontend Transport; Tue, 23 May 2023 06:00:55 -0700 Received: from MININT-80QBFE8.corp.innovium.com (unknown [10.28.164.122]) by 
maili.marvell.com (Postfix) with ESMTP id 70BEA5B6A3D; Tue, 23 May 2023 05:54:33 -0700 (PDT) From: To: , , , Konstantin Ananyev CC: , Pavan Nikhilesh Subject: [PATCH 2/3] ip_frag: improve reassembly lookup performance Date: Tue, 23 May 2023 18:24:12 +0530 Message-ID: <20230523125413.6324-2-pbhagavatula@marvell.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20230523125413.6324-1-pbhagavatula@marvell.com> References: <20230523125413.6324-1-pbhagavatula@marvell.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain X-Proofpoint-ORIG-GUID: XhGFXzVD4yEiNVVgbLTzR08TiRyFhin0 X-Proofpoint-GUID: XhGFXzVD4yEiNVVgbLTzR08TiRyFhin0 X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.254,Aquarius:18.0.957,Hydra:6.0.573,FMLib:17.11.176.26 definitions=2023-05-23_08,2023-05-23_02,2023-05-22_02 X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org From: Pavan Nikhilesh Improve reassembly lookup performance by using NEON intrinsics for key validation. 
Signed-off-by: Pavan Nikhilesh
---
 lib/ip_frag/ip_frag_internal.c   | 237 +++++++++++++++++++++++++------
 lib/ip_frag/ip_reassembly.h      |   6 +
 lib/ip_frag/rte_ip_frag_common.c |  10 ++
 3 files changed, 209 insertions(+), 44 deletions(-)

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 7cbef647df..de78a0ed8f 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -4,8 +4,9 @@
 
 #include
 
-#include
 #include
+#include
+#include
 
 #include "ip_frag_common.h"
 
@@ -280,10 +281,179 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	return pkt;
 }
 
-struct ip_frag_pkt *
-ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
-	const struct ip_frag_key *key, uint64_t tms,
-	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+/*
+ * Emit a debug log record for bucket entry p: table occupancy, the entry's
+ * position (list_idx of list_cnt) and its key, id and start value.
+ */
+static inline void
+ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
+	    uint32_t list_idx, uint32_t list_cnt)
+{
+	RTE_SET_USED(tbl);
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(list_cnt);
+	if (p->key.key_len == IPV4_KEYLEN)
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    p->key.src_dst[0], p->key.id, p->start);
+	else
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <" IPv6_KEY_BYTES_FMT
+			    ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    IPv6_KEY_BYTES(p->key.src_dst), p->key.id,
+			    p->start);
+}
+
+#if defined(RTE_ARCH_ARM64)
+/*
+ * NEON bucket lookup. Each pass of the vector loop examines two entries
+ * from each candidate bucket (p1[0], p2[0], p1[1], p2[1]); the scalar tail
+ * loop handles a leftover single entry per bucket.
+ * Returns the entry whose key matches, otherwise NULL with *free set to the
+ * first empty entry seen and *stale to the first timed-out one (either may
+ * be NULL).
+ */
+static inline struct ip_frag_pkt *
+ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	struct ip_frag_pkt *empty, *old;
+	struct ip_frag_pkt *p1, *p2;
+	uint32_t assoc, sig1, sig2;
+	uint64_t max_cycles;
+
+	empty = NULL;
+	old = NULL;
+
+	max_cycles = tbl->max_cycles;
+	assoc = tbl->bucket_entries;
+
+	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
+		return tbl->last;
+
+	/* different hashing methods for IPv4 and IPv6 */
+	if (key->key_len == IPV4_KEYLEN)
+		ipv4_frag_hash(key, &sig1, &sig2);
+	else
+		ipv6_frag_hash(key, &sig1, &sig2);
+
+	p1 = IP_FRAG_TBL_POS(tbl, sig1);
+	p2 = IP_FRAG_TBL_POS(tbl, sig2);
+
+	uint64x2_t key0, key1, key2, key3;
+	uint64_t vmask, zmask, ts_mask;
+	uint64x2_t ts0 = vdupq_n_u64(0), ts1 = vdupq_n_u64(0);
+	uint32x4_t nz_key = vdupq_n_u32(0);
+	uint8_t idx;
+	/* Bucket entries are always power of 2. */
+	rte_prefetch0(&p1[0].key);
+	rte_prefetch0(&p1[1].key);
+	rte_prefetch0(&p2[0].key);
+	rte_prefetch0(&p2[1].key);
+
+	while (assoc > 1) {
+		if (assoc > 2) {
+			rte_prefetch0(&p1[2].key);
+			rte_prefetch0(&p1[3].key);
+			rte_prefetch0(&p2[2].key);
+			rte_prefetch0(&p2[3].key);
+		}
+		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
+		key0 = vld1q_u64(&p[0]->key.id_key_len);
+		key1 = vld1q_u64(&p[1]->key.id_key_len);
+		key2 = vld1q_u64(&p[2]->key.id_key_len);
+		key3 = vld1q_u64(&p[3]->key.id_key_len);
+
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
+
+		nz_key = vceqzq_u32(nz_key);
+		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
+		vmask = ~zmask;
+
+		vmask &= 0x8000800080008000;
+		for (; vmask > 0; vmask &= vmask - 1) {
+			idx = __builtin_ctzll(vmask) >> 4;
+			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
+				return p[idx];
+		}
+
+		vmask = ~zmask;
+		if (zmask && empty == NULL) {
+			zmask &= 0x8000800080008000;
+			idx = __builtin_ctzll(zmask) >> 4;
+			empty = p[idx];
+		}
+
+		if (vmask && old == NULL) {
+			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
+			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
+
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
+
+			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
+			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
+
+			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
+				vuzp1q_u32(vreinterpretq_u32_u64(ts0),
+					   vreinterpretq_u32_u64(ts1)),
+				16)),
+				0);
+			vmask &= 0x8000800080008000;
+			ts_mask &= vmask;
+			if (ts_mask) {
+				idx = __builtin_ctzll(ts_mask) >> 4;
+				old = p[idx];
+			}
+		}
+		p1 += 2;
+		p2 += 2;
+		/* Two entries per bucket were consumed, matching p1/p2 += 2. */
+		assoc -= 2;
+	}
+	while (assoc) {
+		if (ip_frag_key_cmp(key, &p1->key) == 0)
+			return p1;
+		else if (ip_frag_key_is_empty(&p1->key))
+			empty = (empty == NULL) ? p1 : empty;
+		else if (max_cycles + p1->start < tms)
+			old = (old == NULL) ? p1 : old;
+
+		if (ip_frag_key_cmp(key, &p2->key) == 0)
+			return p2;
+		else if (ip_frag_key_is_empty(&p2->key))
+			empty = (empty == NULL) ? p2 : empty;
+		else if (max_cycles + p2->start < tms)
+			old = (old == NULL) ? p2 : old;
+		p1++;
+		p2++;
+		assoc--;
+	}
+
+	*free = empty;
+	*stale = old;
+	return NULL;
+}
+#endif
+
+static struct ip_frag_pkt *
+ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
 {
 	struct ip_frag_pkt *p1, *p2;
 	struct ip_frag_pkt *empty, *old;
@@ -309,25 +479,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	p2 = IP_FRAG_TBL_POS(tbl, sig2);
 
 	for (i = 0; i != assoc; i++) {
-		if (p1->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
-
+		ip_frag_dbg(tbl, &p1[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
 			return p1 + i;
 		else if (ip_frag_key_is_empty(&p1[i].key))
@@ -335,29 +487,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 		else if (max_cycles + p1[i].start < tms)
 			old = (old == NULL) ? (p1 + i) : old;
 
-		if (p2->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
-
+		ip_frag_dbg(tbl, &p2[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
 			return p2 + i;
 		else if (ip_frag_key_is_empty(&p2[i].key))
-			empty = (empty == NULL) ?( p2 + i) : empty;
+			empty = (empty == NULL) ? (p2 + i) : empty;
 		else if (max_cycles + p2[i].start < tms)
 			old = (old == NULL) ? (p2 + i) : old;
 	}
@@ -366,3 +500,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	*stale = old;
 	return NULL;
 }
+
+struct ip_frag_pkt *
+ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	switch (tbl->lookup_fn) {
+#if defined(RTE_ARCH_ARM64)
+	case REASSEMBLY_LOOKUP_NEON:
+		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
+#endif
+	case REASSEMBLY_LOOKUP_SCALAR:
+	default:
+		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
+	}
+}
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index ef9d8c0d75..049437ae32 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -12,6 +12,11 @@
 
 #include
 
+enum ip_frag_lookup_func {
+	REASSEMBLY_LOOKUP_SCALAR = 0,
+	REASSEMBLY_LOOKUP_NEON,
+};
+
 enum {
 	IP_LAST_FRAG_IDX, /* index of last fragment */
 	IP_FIRST_FRAG_IDX, /* index of first fragment */
@@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
 	struct ip_frag_pkt *last; /* last used entry. */
 	struct ip_pkt_list lru; /* LRU list for table entries. */
 	struct ip_frag_tbl_stat stat; /* statistics counters. */
+	enum ip_frag_lookup_func lookup_fn; /* hash table lookup function. */
 	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
 };
 
diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
index c1de2e81b6..ef3c104e45 100644
--- a/lib/ip_frag/rte_ip_frag_common.c
+++ b/lib/ip_frag/rte_ip_frag_common.c
@@ -5,7 +5,9 @@
 #include
 
 #include
+#include
 #include
+#include
 
 #include "ip_frag_common.h"
 
@@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->bucket_entries = bucket_entries;
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries - 1);
 
+#if defined(RTE_ARCH_ARM64)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
+	else
+#endif
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
+
 	TAILQ_INIT(&(tbl->lru));
 	return tbl;
 }
-- 
2.39.1