From: Ruifeng Wang <ruifeng.wang@arm.com>
To: bruce.richardson@intel.com, vladimir.medvedkin@intel.com,
	rbhansali@marvell.com, fengchengwen@huawei.com
Cc: dev@dpdk.org, jerinj@marvell.com, honnappa.nagarahalli@arm.com,
	nd@arm.com, Ruifeng Wang <ruifeng.wang@arm.com>
Subject: [PATCH] lpm: use SVE for bulk lookup
Date: Mon,  9 Jan 2023 17:28:07 +0800
Message-ID: <20230109092807.2813215-1-ruifeng.wang@arm.com>

SVE was used for lookupx4, where only four entries are looked up at
a time. That is not an ideal fit for SVE, whose implementation is not
bound to a fixed vector length.
Changed to use the SVE implementation for bulk lookup when the feature
is available, and optimized the SVE implementation. The helper now
returns int to match the rte_lpm_lookup_bulk() API.
lookupx4 sticks to the NEON implementation.

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
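As with the scalar bulk path, the SVE bulk lookup stores the raw table
entry, so callers receive the RTE_LPM_LOOKUP_SUCCESS flag along with
the next hop in the low 24 bits. A minimal caller sketch follows; the
burst size and the handle_hit()/handle_miss() helpers are hypothetical,
for illustration only:

#include <stdint.h>
#include <rte_lpm.h>

#define BURST 32	/* hypothetical burst size */

/* Hypothetical per-packet handlers. */
extern void handle_hit(uint32_t idx, uint32_t next_hop);
extern void handle_miss(uint32_t idx);

static void
lookup_burst(const struct rte_lpm *lpm, const uint32_t ips[BURST])
{
	uint32_t hops[BURST];
	uint32_t i;

	/* On Arm builds with SVE ACLE this now dispatches to
	 * __rte_lpm_lookup_vec(). */
	rte_lpm_lookup_bulk(lpm, ips, hops, BURST);

	for (i = 0; i < BURST; i++) {
		if (hops[i] & RTE_LPM_LOOKUP_SUCCESS)
			handle_hit(i, hops[i] & 0x00ffffff); /* low 24 bits */
		else
			handle_miss(i);
	}
}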
 lib/lpm/rte_lpm.h     |  6 ++-
 lib/lpm/rte_lpm_sve.h | 96 +++++++++++++++++++++++++------------------
 2 files changed, 61 insertions(+), 41 deletions(-)
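
The reworked loop follows the common SVE structure: full vectors are
processed with an all-true predicate, and a single svwhilelt-governed
iteration handles the tail. A standalone sketch of just that pattern,
assuming SVE ACLE support (e.g. built with -march=armv8-a+sve); the
element-wise add is a stand-in for the lookup body:

#include <stdint.h>
#include <arm_sve.h>

static void
add_one(const uint32_t *in, uint32_t *out, uint32_t n)
{
	uint32_t i;
	uint64_t vl = svcntw();		/* 32-bit lanes per vector */
	svbool_t pg = svptrue_b32();	/* all-true predicate */
	svuint32_t v;

	/* Full vectors: the constant predicate allows unpredicated forms. */
	for (i = 0; i + vl < n; i += vl) {
		v = svld1(pg, &in[i]);
		svst1(pg, &out[i], svadd_x(pg, v, 1));
	}

	/* One predicated tail covers the remaining 1..vl elements. */
	pg = svwhilelt_b32(i, n);
	if (svptest_any(svptrue_b32(), pg)) {
		v = svld1(pg, &in[i]);
		svst1(pg, &out[i], svadd_x(pg, v, 1));
	}
}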

diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
index 09ad307869..75e27ff164 100644
--- a/lib/lpm/rte_lpm.h
+++ b/lib/lpm/rte_lpm.h
@@ -400,9 +400,11 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 #if defined(RTE_ARCH_ARM)
 #ifdef RTE_HAS_SVE_ACLE
 #include "rte_lpm_sve.h"
-#else
-#include "rte_lpm_neon.h"
+#undef rte_lpm_lookup_bulk
+#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \
+		__rte_lpm_lookup_vec(lpm, ips, next_hops, n)
 #endif
+#include "rte_lpm_neon.h"
 #elif defined(RTE_ARCH_PPC_64)
 #include "rte_lpm_altivec.h"
 #elif defined(RTE_ARCH_X86)
diff --git a/lib/lpm/rte_lpm_sve.h b/lib/lpm/rte_lpm_sve.h
index 94ead70c39..52b8bd4940 100644
--- a/lib/lpm/rte_lpm_sve.h
+++ b/lib/lpm/rte_lpm_sve.h
@@ -12,18 +12,22 @@
 extern "C" {
 #endif
 
-__rte_internal
-static void
+static inline int
 __rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
 		uint32_t *__rte_restrict next_hops, const uint32_t n)
 {
-	uint32_t i = 0;
-	svuint32_t v_ip, v_idx, v_tbl24, v_tbl8, v_hop;
-	svuint32_t v_mask_xv, v_mask_v, v_mask_hop;
-	svbool_t pg = svwhilelt_b32(i, n);
+	uint32_t i;
+	uint64_t vl = svcntw();
+	svuint32_t v_ip, v_idx, v_tbl24, v_tbl8;
+	svuint32_t v_mask_xv, v_mask_v;
+	svbool_t pg = svptrue_b32();
 	svbool_t pv;
 
-	do {
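+	/* Default to lookup miss: RTE_LPM_LOOKUP_SUCCESS flag not set */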
+	for (i = 0; i < n; i++)
+		next_hops[i] = 0;
+
+	for (i = 0; i + vl < n; i += vl) {
 		v_ip = svld1(pg, &ips[i]);
 		/* Get indices for tbl24[] */
 		v_idx = svlsr_x(pg, v_ip, 8);
@@ -37,46 +40,61 @@ __rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
 		v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK);
 		/* Create predicate for tbl24 entries: (valid && !valid_group) */
 		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v);
-		/* Create mask for next_hop in table entry */
-		v_mask_hop = svdup_u32_z(pg, 0x00ffffff);
-		/* Extract next_hop and write back */
-		v_hop = svand_x(pv, v_tbl24, v_mask_hop);
-		svst1(pv, &next_hops[i], v_hop);
+		svst1(pv, &next_hops[i], v_tbl24);
 
 		/* Update predicate for tbl24 entries: (valid && valid_group) */
 		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv);
-		/* Compute tbl8 index */
-		v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
-		v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
-		v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
-				v_idx);
-		/* Extract values from tbl8[] */
-		v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
-						v_idx);
-		/* Update predicate for tbl8 entries: (valid) */
-		pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
-		/* Extract next_hop and write back */
-		v_hop = svand_x(pv, v_tbl8, v_mask_hop);
-		svst1(pv, &next_hops[i], v_hop);
+		if (svptest_any(pg, pv)) {
+			/* Compute tbl8 index */
+			v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
+			v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+			v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
+					v_idx);
+			/* Extract values from tbl8[] */
+			v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
+							v_idx);
+			/* Update predicate for tbl8 entries: (valid) */
+			pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
+			svst1(pv, &next_hops[i], v_tbl8);
+		}
+	}
 
-		i += svlen(v_ip);
-		pg = svwhilelt_b32(i, n);
-	} while (svptest_any(svptrue_b32(), pg));
-}
+	pg = svwhilelt_b32(i, n);
+	if (svptest_any(svptrue_b32(), pg)) {
+		v_ip = svld1(pg, &ips[i]);
+		/* Get indices for tbl24[] */
+		v_idx = svlsr_x(pg, v_ip, 8);
+		/* Extract values from tbl24[] */
+		v_tbl24 = svld1_gather_index(pg, (const uint32_t *)lpm->tbl24,
+						v_idx);
 
-static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
-		uint32_t defv)
-{
-	uint32_t i, ips[4];
+		/* Create mask with valid set */
+		v_mask_v = svdup_u32_z(pg, RTE_LPM_LOOKUP_SUCCESS);
+		/* Create mask with valid and valid_group set */
+		v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK);
+		/* Create predicate for tbl24 entries: (valid && !valid_group) */
+		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v);
+		svst1(pv, &next_hops[i], v_tbl24);
 
-	vst1q_s32((int32_t *)ips, ip);
-	for (i = 0; i < 4; i++)
-		hop[i] = defv;
+		/* Update predicate for tbl24 entries: (valid && valid_group) */
+		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv);
+		if (svptest_any(pg, pv)) {
+			/* Compute tbl8 index */
+			v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
+			v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+			v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
+					v_idx);
+			/* Extract values from tbl8[] */
+			v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
+							v_idx);
+			/* Update predicate for tbl8 entries: (valid) */
+			pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
+			svst1(pv, &next_hops[i], v_tbl8);
+		}
+	}
 
-	__rte_lpm_lookup_vec(lpm, ips, hop, 4);
+	return 0;
 }
-
 #ifdef __cplusplus
 }
 #endif
-- 
2.25.1

