From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id ABF664237C; Mon, 9 Jan 2023 10:28:24 +0100 (CET) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 6279640689; Mon, 9 Jan 2023 10:28:24 +0100 (CET) Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by mails.dpdk.org (Postfix) with ESMTP id E80D340687 for ; Mon, 9 Jan 2023 10:28:22 +0100 (CET) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 578BA1042; Mon, 9 Jan 2023 01:29:04 -0800 (PST) Received: from net-arm-n1amp-02.shanghai.arm.com (net-arm-n1amp-02.shanghai.arm.com [10.169.210.108]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 860573F67D; Mon, 9 Jan 2023 01:28:19 -0800 (PST) From: Ruifeng Wang To: bruce.richardson@intel.com, vladimir.medvedkin@intel.com, rbhansali@marvell.com, fengchengwen@huawei.com Cc: dev@dpdk.org, jerinj@marvell.com, honnappa.nagarahalli@arm.com, nd@arm.com, Ruifeng Wang Subject: [PATCH] lpm: use SVE for bulk lookup Date: Mon, 9 Jan 2023 17:28:07 +0800 Message-Id: <20230109092807.2813215-1-ruifeng.wang@arm.com> X-Mailer: git-send-email 2.25.1 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org SVE was used for lookupx4, where four entries are looked up at a time. That is not an ideal case for an SVE implementation, which is not bound to a specific vector length. Change to use the SVE implementation in bulk lookup when the feature is available, and optimize the SVE implementation. The lookupx4 function sticks to the NEON implementation. 
Signed-off-by: Ruifeng Wang --- lib/lpm/rte_lpm.h | 6 ++- lib/lpm/rte_lpm_sve.h | 96 +++++++++++++++++++++++++------------------ 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h index 09ad307869..75e27ff164 100644 --- a/lib/lpm/rte_lpm.h +++ b/lib/lpm/rte_lpm.h @@ -400,9 +400,11 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], #if defined(RTE_ARCH_ARM) #ifdef RTE_HAS_SVE_ACLE #include "rte_lpm_sve.h" -#else -#include "rte_lpm_neon.h" +#undef rte_lpm_lookup_bulk +#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ + __rte_lpm_lookup_vec(lpm, ips, next_hops, n) #endif +#include "rte_lpm_neon.h" #elif defined(RTE_ARCH_PPC_64) #include "rte_lpm_altivec.h" #elif defined(RTE_ARCH_X86) diff --git a/lib/lpm/rte_lpm_sve.h b/lib/lpm/rte_lpm_sve.h index 94ead70c39..52b8bd4940 100644 --- a/lib/lpm/rte_lpm_sve.h +++ b/lib/lpm/rte_lpm_sve.h @@ -12,18 +12,21 @@ extern "C" { #endif -__rte_internal -static void +static inline int __rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips, uint32_t *__rte_restrict next_hops, const uint32_t n) { - uint32_t i = 0; - svuint32_t v_ip, v_idx, v_tbl24, v_tbl8, v_hop; - svuint32_t v_mask_xv, v_mask_v, v_mask_hop; - svbool_t pg = svwhilelt_b32(i, n); + uint32_t i; + uint64_t vl = svcntw(); + svuint32_t v_ip, v_idx, v_tbl24, v_tbl8; + svuint32_t v_mask_xv, v_mask_v; + svbool_t pg = svptrue_b32(); svbool_t pv; - do { + for (i = 0; i < n; i++) + next_hops[i] = 0; + + for (i = 0; i < n - vl; i += vl) { v_ip = svld1(pg, &ips[i]); /* Get indices for tbl24[] */ v_idx = svlsr_x(pg, v_ip, 8); @@ -37,46 +40,61 @@ __rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips, v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK); /* Create predicate for tbl24 entries: (valid && !valid_group) */ pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v); - /* Create mask for next_hop in table entry */ - v_mask_hop = svdup_u32_z(pg, 0x00ffffff); - /* 
Extract next_hop and write back */ - v_hop = svand_x(pv, v_tbl24, v_mask_hop); - svst1(pv, &next_hops[i], v_hop); + svst1(pv, &next_hops[i], v_tbl24); /* Update predicate for tbl24 entries: (valid && valid_group) */ pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv); - /* Compute tbl8 index */ - v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff)); - v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES); - v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)), - v_idx); - /* Extract values from tbl8[] */ - v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8, - v_idx); - /* Update predicate for tbl8 entries: (valid) */ - pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v); - /* Extract next_hop and write back */ - v_hop = svand_x(pv, v_tbl8, v_mask_hop); - svst1(pv, &next_hops[i], v_hop); + if (svptest_any(pg, pv)) { + /* Compute tbl8 index */ + v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff)); + v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)), + v_idx); + /* Extract values from tbl8[] */ + v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8, + v_idx); + /* Update predicate for tbl8 entries: (valid) */ + pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v); + svst1(pv, &next_hops[i], v_tbl8); + } + } - i += svlen(v_ip); - pg = svwhilelt_b32(i, n); - } while (svptest_any(svptrue_b32(), pg)); -} + pg = svwhilelt_b32(i, n); + if (svptest_any(svptrue_b32(), pg)) { + v_ip = svld1(pg, &ips[i]); + /* Get indices for tbl24[] */ + v_idx = svlsr_x(pg, v_ip, 8); + /* Extract values from tbl24[] */ + v_tbl24 = svld1_gather_index(pg, (const uint32_t *)lpm->tbl24, + v_idx); -static inline void -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], - uint32_t defv) -{ - uint32_t i, ips[4]; + /* Create mask with valid set */ + v_mask_v = svdup_u32_z(pg, RTE_LPM_LOOKUP_SUCCESS); + /* Create mask with valid and valid_group set */ + 
v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK); + /* Create predicate for tbl24 entries: (valid && !valid_group) */ + pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v); + svst1(pv, &next_hops[i], v_tbl24); - vst1q_s32((int32_t *)ips, ip); - for (i = 0; i < 4; i++) - hop[i] = defv; + /* Update predicate for tbl24 entries: (valid && valid_group) */ + pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv); + if (svptest_any(pg, pv)) { + /* Compute tbl8 index */ + v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff)); + v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)), + v_idx); + /* Extract values from tbl8[] */ + v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8, + v_idx); + /* Update predicate for tbl8 entries: (valid) */ + pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v); + svst1(pv, &next_hops[i], v_tbl8); + } + } - __rte_lpm_lookup_vec(lpm, ips, hop, 4); + return 0; } - #ifdef __cplusplus } #endif -- 2.25.1