* [PATCH] lpm: use SVE for bulk lookup
@ 2023-01-09 9:28 Ruifeng Wang
2023-02-07 11:15 ` Thomas Monjalon
0 siblings, 1 reply; 2+ messages in thread
From: Ruifeng Wang @ 2023-01-09 9:28 UTC (permalink / raw)
To: bruce.richardson, vladimir.medvedkin, rbhansali, fengchengwen
Cc: dev, jerinj, honnappa.nagarahalli, nd, Ruifeng Wang
SVE was used for lookupx4, where four entries are looked up at a time.
This is not an ideal case for an SVE implementation, which is not
tied to a fixed vector length.
Changed to use the SVE implementation for bulk lookup when the feature
is available, and optimized the SVE implementation.
lookupx4 sticks to the NEON implementation.
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
lib/lpm/rte_lpm.h | 6 ++-
lib/lpm/rte_lpm_sve.h | 96 +++++++++++++++++++++++++------------------
2 files changed, 61 insertions(+), 41 deletions(-)
diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
index 09ad307869..75e27ff164 100644
--- a/lib/lpm/rte_lpm.h
+++ b/lib/lpm/rte_lpm.h
@@ -400,9 +400,11 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
#if defined(RTE_ARCH_ARM)
#ifdef RTE_HAS_SVE_ACLE
#include "rte_lpm_sve.h"
-#else
-#include "rte_lpm_neon.h"
+#undef rte_lpm_lookup_bulk
+#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \
+ __rte_lpm_lookup_vec(lpm, ips, next_hops, n)
#endif
+#include "rte_lpm_neon.h"
#elif defined(RTE_ARCH_PPC_64)
#include "rte_lpm_altivec.h"
#elif defined(RTE_ARCH_X86)
diff --git a/lib/lpm/rte_lpm_sve.h b/lib/lpm/rte_lpm_sve.h
index 94ead70c39..52b8bd4940 100644
--- a/lib/lpm/rte_lpm_sve.h
+++ b/lib/lpm/rte_lpm_sve.h
@@ -12,18 +12,21 @@
extern "C" {
#endif
-__rte_internal
-static void
+static inline int
__rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
uint32_t *__rte_restrict next_hops, const uint32_t n)
{
- uint32_t i = 0;
- svuint32_t v_ip, v_idx, v_tbl24, v_tbl8, v_hop;
- svuint32_t v_mask_xv, v_mask_v, v_mask_hop;
- svbool_t pg = svwhilelt_b32(i, n);
+ uint32_t i;
+ uint64_t vl = svcntw();
+ svuint32_t v_ip, v_idx, v_tbl24, v_tbl8;
+ svuint32_t v_mask_xv, v_mask_v;
+ svbool_t pg = svptrue_b32();
svbool_t pv;
- do {
+ for (i = 0; i < n; i++)
+ next_hops[i] = 0;
+
+ for (i = 0; i < n - vl; i += vl) {
v_ip = svld1(pg, &ips[i]);
/* Get indices for tbl24[] */
v_idx = svlsr_x(pg, v_ip, 8);
@@ -37,46 +40,61 @@ __rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK);
/* Create predicate for tbl24 entries: (valid && !valid_group) */
pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v);
- /* Create mask for next_hop in table entry */
- v_mask_hop = svdup_u32_z(pg, 0x00ffffff);
- /* Extract next_hop and write back */
- v_hop = svand_x(pv, v_tbl24, v_mask_hop);
- svst1(pv, &next_hops[i], v_hop);
+ svst1(pv, &next_hops[i], v_tbl24);
/* Update predicate for tbl24 entries: (valid && valid_group) */
pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv);
- /* Compute tbl8 index */
- v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
- v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
- v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
- v_idx);
- /* Extract values from tbl8[] */
- v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
- v_idx);
- /* Update predicate for tbl8 entries: (valid) */
- pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
- /* Extract next_hop and write back */
- v_hop = svand_x(pv, v_tbl8, v_mask_hop);
- svst1(pv, &next_hops[i], v_hop);
+ if (svptest_any(pg, pv)) {
+ /* Compute tbl8 index */
+ v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
+ v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+ v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
+ v_idx);
+ /* Extract values from tbl8[] */
+ v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
+ v_idx);
+ /* Update predicate for tbl8 entries: (valid) */
+ pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
+ svst1(pv, &next_hops[i], v_tbl8);
+ }
+ }
- i += svlen(v_ip);
- pg = svwhilelt_b32(i, n);
- } while (svptest_any(svptrue_b32(), pg));
-}
+ pg = svwhilelt_b32(i, n);
+ if (svptest_any(svptrue_b32(), pg)) {
+ v_ip = svld1(pg, &ips[i]);
+ /* Get indices for tbl24[] */
+ v_idx = svlsr_x(pg, v_ip, 8);
+ /* Extract values from tbl24[] */
+ v_tbl24 = svld1_gather_index(pg, (const uint32_t *)lpm->tbl24,
+ v_idx);
-static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
- uint32_t defv)
-{
- uint32_t i, ips[4];
+ /* Create mask with valid set */
+ v_mask_v = svdup_u32_z(pg, RTE_LPM_LOOKUP_SUCCESS);
+ /* Create mask with valid and valid_group set */
+ v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK);
+ /* Create predicate for tbl24 entries: (valid && !valid_group) */
+ pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v);
+ svst1(pv, &next_hops[i], v_tbl24);
- vst1q_s32((int32_t *)ips, ip);
- for (i = 0; i < 4; i++)
- hop[i] = defv;
+ /* Update predicate for tbl24 entries: (valid && valid_group) */
+ pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv);
+ if (svptest_any(pg, pv)) {
+ /* Compute tbl8 index */
+ v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
+ v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+ v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
+ v_idx);
+ /* Extract values from tbl8[] */
+ v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
+ v_idx);
+ /* Update predicate for tbl8 entries: (valid) */
+ pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
+ svst1(pv, &next_hops[i], v_tbl8);
+ }
+ }
- __rte_lpm_lookup_vec(lpm, ips, hop, 4);
+ return 0;
}
-
#ifdef __cplusplus
}
#endif
--
2.25.1
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH] lpm: use SVE for bulk lookup
2023-01-09 9:28 [PATCH] lpm: use SVE for bulk lookup Ruifeng Wang
@ 2023-02-07 11:15 ` Thomas Monjalon
0 siblings, 0 replies; 2+ messages in thread
From: Thomas Monjalon @ 2023-02-07 11:15 UTC (permalink / raw)
To: Ruifeng Wang
Cc: bruce.richardson, vladimir.medvedkin, rbhansali, fengchengwen,
dev, jerinj, honnappa.nagarahalli, nd
09/01/2023 10:28, Ruifeng Wang:
> SVE was used for lookupx4 where four entries are looked up at a time.
> It is not an ideal case for SVE implementation which does not bound
> to vector length.
> Changed to use SVE implementation in bulk lookup when the feature is
> available. And optimized the SVE implementation.
> The lookupx4 sticks to NEON implementation.
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Applied, thanks.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-02-07 11:15 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-09 9:28 [PATCH] lpm: use SVE for bulk lookup Ruifeng Wang
2023-02-07 11:15 ` Thomas Monjalon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).