DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4
@ 2025-05-28 17:00 uk7b
  2025-05-30 13:13 ` Medvedkin, Vladimir
  0 siblings, 1 reply; 5+ messages in thread
From: uk7b @ 2025-05-28 17:00 UTC (permalink / raw)
  To: dev
  Cc: sunyuechi, Thomas Monjalon, Bruce Richardson, Vladimir Medvedkin,
	Stanislaw Kardach

From: sunyuechi <sunyuechi@iscas.ac.cn>

bpi-f3:
    scalar: 5.7 cycles
    rvv:    2.4 cycles

Maybe runtime detection in LPM should be added for all architectures,
but this commit is only about the RVV part.

Signed-off-by: sunyuechi <sunyuechi@iscas.ac.cn>
---
 MAINTAINERS           |  2 +
 lib/lpm/meson.build   |  1 +
 lib/lpm/rte_lpm.h     |  2 +
 lib/lpm/rte_lpm_rvv.h | 91 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 lib/lpm/rte_lpm_rvv.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 3e16789250..0f207ac129 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -340,6 +340,8 @@ M: Stanislaw Kardach <stanislaw.kardach@gmail.com>
 F: config/riscv/
 F: doc/guides/linux_gsg/cross_build_dpdk_for_riscv.rst
 F: lib/eal/riscv/
+M: sunyuechi <sunyuechi@iscas.ac.cn>
+F: lib/**/*rvv*
 
 Intel x86
 M: Bruce Richardson <bruce.richardson@intel.com>
diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build
index fae4f79fb9..09133061e5 100644
--- a/lib/lpm/meson.build
+++ b/lib/lpm/meson.build
@@ -17,6 +17,7 @@ indirect_headers += files(
         'rte_lpm_scalar.h',
         'rte_lpm_sse.h',
         'rte_lpm_sve.h',
+        'rte_lpm_rvv.h',
 )
 deps += ['hash']
 deps += ['rcu']
diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
index 7df64f06b1..b06517206f 100644
--- a/lib/lpm/rte_lpm.h
+++ b/lib/lpm/rte_lpm.h
@@ -408,6 +408,8 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 #include "rte_lpm_altivec.h"
 #elif defined(RTE_ARCH_X86)
 #include "rte_lpm_sse.h"
+#elif defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
+#include "rte_lpm_rvv.h"
 #else
 #include "rte_lpm_scalar.h"
 #endif
diff --git a/lib/lpm/rte_lpm_rvv.h b/lib/lpm/rte_lpm_rvv.h
new file mode 100644
index 0000000000..d6aa1500be
--- /dev/null
+++ b/lib/lpm/rte_lpm_rvv.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+ */
+
+#ifndef _RTE_LPM_RVV_H_
+#define _RTE_LPM_RVV_H_
+
+#include <rte_vect.h>
+
+#include <rte_cpuflags.h>
+#include <riscv_vector.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_LPM_LOOKUP_SUCCESS 0x01000000
+#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x03000000
+
+typedef void (*lpm_lookupx4_fn)(const struct rte_lpm *, xmm_t, uint32_t[4], uint32_t);
+
+static lpm_lookupx4_fn lpm_lookupx4_impl;
+
+static inline void rte_lpm_lookupx4_scalar(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	uint32_t nh;
+	int ret;
+
+	for (int i = 0; i < 4; i++) {
+		ret = rte_lpm_lookup(lpm, ip[i], &nh);
+		hop[i] = (ret == 0) ? nh : defv;
+	}
+}
+
+static inline void rte_lpm_lookupx4_rvv(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	size_t vl = 4;
+
+	const uint32_t *tbl24_p = (const uint32_t *)lpm->tbl24;
+	uint32_t tbl_entries[4] = {
+		tbl24_p[((uint32_t)ip[0]) >> 8],
+		tbl24_p[((uint32_t)ip[1]) >> 8],
+		tbl24_p[((uint32_t)ip[2]) >> 8],
+		tbl24_p[((uint32_t)ip[3]) >> 8],
+	};
+	vuint32m1_t vtbl_entry = __riscv_vle32_v_u32m1(tbl_entries, vl);
+
+	vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(
+	    __riscv_vand_vx_u32m1(vtbl_entry, RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl),
+	    RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl);
+
+	vuint32m1_t vtbl8_index = __riscv_vsll_vx_u32m1(
+	    __riscv_vadd_vv_u32m1(
+		__riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(vtbl_entry, 0x00FFFFFF, vl), 8, vl),
+		__riscv_vand_vx_u32m1(
+		    __riscv_vle32_v_u32m1((const uint32_t *)&ip, vl), 0x000000FF, vl),
+		vl),
+	    2, vl);
+
+	vtbl_entry = __riscv_vluxei32_v_u32m1_mu(
+	    mask, vtbl_entry, (const uint32_t *)(lpm->tbl8), vtbl8_index, vl);
+
+	vuint32m1_t vnext_hop = __riscv_vand_vx_u32m1(vtbl_entry, 0x00FFFFFF, vl);
+	mask = __riscv_vmseq_vx_u32m1_b32(
+	    __riscv_vand_vx_u32m1(vtbl_entry, RTE_LPM_LOOKUP_SUCCESS, vl), 0, vl);
+
+	vnext_hop = __riscv_vmerge_vxm_u32m1(vnext_hop, defv, mask, vl);
+
+	__riscv_vse32_v_u32m1(hop, vnext_hop, vl);
+}
+
+static inline void rte_lpm_lookupx4(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	lpm_lookupx4_impl(lpm, ip, hop, defv);
+}
+
+RTE_INIT(rte_lpm_init_alg)
+{
+	lpm_lookupx4_impl = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V)
+	    ? rte_lpm_lookupx4_rvv
+	    : rte_lpm_lookupx4_scalar;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_RVV_H_ */
-- 
2.49.0


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4
  2025-05-28 17:00 [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4 uk7b
@ 2025-05-30 13:13 ` Medvedkin, Vladimir
  2025-06-04 11:39   ` 孙越池
  0 siblings, 1 reply; 5+ messages in thread
From: Medvedkin, Vladimir @ 2025-05-30 13:13 UTC (permalink / raw)
  To: uk7b, dev; +Cc: sunyuechi, Thomas Monjalon, Bruce Richardson, Stanislaw Kardach

[-- Attachment #1: Type: text/plain, Size: 2249 bytes --]

Hi Sunyuechi,


On 28/05/2025 18:00, uk7b@foxmail.com wrote:
> From: sunyuechi<sunyuechi@iscas.ac.cn>
>
> bpi-f3:
>      scalar: 5.7 cycles
>      rvv:    2.4 cycles
>
> Maybe runtime detection in LPM should be added for all architectures,
> but this commit is only about the RVV part.

I would advise you to look into the FIB library, it has exactly what you are looking for.

Also, please consider writing a slightly more informative and explanatory commit message.

> Signed-off-by: sunyuechi<sunyuechi@iscas.ac.cn>
> ---
>   MAINTAINERS           |  2 +
>   lib/lpm/meson.build   |  1 +
>   lib/lpm/rte_lpm.h     |  2 +
>   lib/lpm/rte_lpm_rvv.h | 91 +++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 96 insertions(+)
>   create mode 100644 lib/lpm/rte_lpm_rvv.h
>
<snip>
> +static inline void rte_lpm_lookupx4_rvv(
> +	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
> +{
> +	size_t vl = 4;
> +
> +	const uint32_t *tbl24_p = (const uint32_t *)lpm->tbl24;
> +	uint32_t tbl_entries[4] = {
> +		tbl24_p[((uint32_t)ip[0]) >> 8],
> +		tbl24_p[((uint32_t)ip[1]) >> 8],
> +		tbl24_p[((uint32_t)ip[2]) >> 8],
> +		tbl24_p[((uint32_t)ip[3]) >> 8],
> +	};

I'm not an expert in RISC-V, but why is it done in a scalar way instead of
using __riscv_vsrl_vx_u32m1()? I assume you're relying on the compiler here?

Also, have you redefined the xmm_t type for proper index addressing?

> +	vuint32m1_t vtbl_entry = __riscv_vle32_v_u32m1(tbl_entries, vl);
> +
> +	vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(
> +	    __riscv_vand_vx_u32m1(vtbl_entry, RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl),
> +	    RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl);
<snip>
> +
> +static inline void rte_lpm_lookupx4(
> +	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
> +{
> +	lpm_lookupx4_impl(lpm, ip, hop, defv);
> +}
> +
> +RTE_INIT(rte_lpm_init_alg)
> +{
> +	lpm_lookupx4_impl = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V)
> +	    ? rte_lpm_lookupx4_rvv
> +	    : rte_lpm_lookupx4_scalar;
> +}
As I mentioned earlier, I'd recommend that you use FIB to select an
implementation at runtime. All the rest LPM vector x4 implementations are
done this way, and their code is inlined.
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_LPM_RVV_H_ */

-- 
Regards,
Vladimir

[-- Attachment #2: Type: text/html, Size: 14247 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4
  2025-05-30 13:13 ` Medvedkin, Vladimir
@ 2025-06-04 11:39   ` 孙越池
  2025-06-04 17:03     ` Medvedkin, Vladimir
  0 siblings, 1 reply; 5+ messages in thread
From: 孙越池 @ 2025-06-04 11:39 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev

[-- Attachment #1: Type: text/plain, Size: 4271 bytes --]

> why is it done in a scalar way instead of using `__riscv_vsrl_vx_u32m1()?` I assume you're relying on the compiler here?

I don't know the exact reason, but based on experience, using indexed loads tends to be slower for small-scale and low-computation cases. So I've tried both methods.
In this case, if using `vsrl`, it would require `__riscv_vluxei32_v_u32m1`, which is much slower.

```
vuint32m1_t vip_shifted = __riscv_vsll_vx_u32m1(__riscv_vsrl_vx_u32m1(__riscv_vle32_v_u32m1((const uint32_t *)&ip, vl), 8, vl), 2, vl);
vuint32m1_t vtbl_entry = __riscv_vluxei32_v_u32m1(
    (const uint32_t *)(lpm->tbl24), vip_shifted, vl);
```

> have you redefined the xmm_t type for proper index addressing?

It is in `eal/riscv/include/rte_vect.h:`

```
typedef int32_t xmm_t __attribute__((vector_size(16)));
```

> I'd recommend that you use FIB to select an implementation at runtime. All the rest LPM vector x4 implementations are done this way, and their code is inlined.
> Also, please consider writing a slightly more informative and explanatory commit message.

I agree that the FIB approach is clearly better here, but adopting this method would require changing the function initialization logic for all architectures in LPM, as well as updating the relevant structures.

I'm not sure it's worth doing right now, since this commit is intended to be just a small change for RISC-V. I'm more inclined to follow the existing structure and avoid touching other architectures' code.
Would it be acceptable to leave this kind of refactoring for the future?

If you're certain it should be done now, I'll make the changes. For now, I've only updated the commit message to include this idea (v2).






-----原始邮件-----
发件人:"Medvedkin, Vladimir" <vladimir.medvedkin@intel.com>
发送时间:2025-05-30 21:13:57 (星期五)
收件人: uk7b@foxmail.com, dev@dpdk.org
抄送: sunyuechi <sunyuechi@iscas.ac.cn>, "Thomas Monjalon" <thomas@monjalon.net>, "Bruce Richardson" <bruce.richardson@intel.com>, "Stanislaw Kardach" <stanislaw.kardach@gmail.com>
主题: Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4



Hi Sunyuechi,




On 28/05/2025 18:00, uk7b@foxmail.com wrote:

From: sunyuechi <sunyuechi@iscas.ac.cn> bpi-f3:
    scalar: 5.7 cycles
    rvv:    2.4 cycles

Maybe runtime detection in LPM should be added for all architectures,
but this commit is only about the RVV part.


I would advise you to look into the FIB library, it has exactly what you are looking for.

Also, please consider writing a slightly more informative and explanatory commit message.

Signed-off-by: sunyuechi <sunyuechi@iscas.ac.cn> ---
 MAINTAINERS           |  2 +
 lib/lpm/meson.build   |  1 +
 lib/lpm/rte_lpm.h     |  2 +
 lib/lpm/rte_lpm_rvv.h | 91 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 lib/lpm/rte_lpm_rvv.h


<snip>
+static inline void rte_lpm_lookupx4_rvv(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	size_t vl = 4;
+
+	const uint32_t *tbl24_p = (const uint32_t *)lpm->tbl24;
+	uint32_t tbl_entries[4] = {
+		tbl24_p[((uint32_t)ip[0]) >> 8],
+		tbl24_p[((uint32_t)ip[1]) >> 8],
+		tbl24_p[((uint32_t)ip[2]) >> 8],
+		tbl24_p[((uint32_t)ip[3]) >> 8],
+	};

I'm not an expert in RISC-V, but why is it done in a scalar way instead of using __riscv_vsrl_vx_u32m1()? I assume you're relying on the compiler here?

Also, have you redefined the xmm_t type for proper index addressing?

+	vuint32m1_t vtbl_entry = __riscv_vle32_v_u32m1(tbl_entries, vl);
+
+	vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(
+	    __riscv_vand_vx_u32m1(vtbl_entry, RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl),
+	    RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl);

<snip>
+
+static inline void rte_lpm_lookupx4(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	lpm_lookupx4_impl(lpm, ip, hop, defv);
+}
+
+RTE_INIT(rte_lpm_init_alg)
+{
+	lpm_lookupx4_impl = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V)
+	    ? rte_lpm_lookupx4_rvv
+	    : rte_lpm_lookupx4_scalar;
+}
As I mentioned earlier, I'd recommend that you use FIB to select an implementation at runtime. All the rest LPM vector x4 implementations are done this way, and their code is inlined.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_RVV_H_ */

-- 
Regards,
Vladimir

[-- Attachment #2: Type: text/html, Size: 16423 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4
  2025-06-04 11:39   ` 孙越池
@ 2025-06-04 17:03     ` Medvedkin, Vladimir
  2025-06-05 10:59       ` 孙越池
  0 siblings, 1 reply; 5+ messages in thread
From: Medvedkin, Vladimir @ 2025-06-04 17:03 UTC (permalink / raw)
  To: 孙越池, dev

[-- Attachment #1: Type: text/plain, Size: 6059 bytes --]

Hi Sunyuechi,


On 04/06/2025 12:39, 孙越池 wrote:
> > why is it done in a scalar way instead of using 
> `__riscv_vsrl_vx_u32m1()?` I assume you're relying on the compiler here?
>
> I don't know the exact reason, but based on experience, using indexed 
> loads tends to be slower for small-scale and low-computation cases. So 
> I've tried both methods.
> In this case, if using `vsrl`, it would require 
> `__riscv_vluxei32_v_u32m1`, which is much slower.
>
> ```
> vuint32m1_t vip_shifted = 
> __riscv_vsll_vx_u32m1(__riscv_vsrl_vx_u32m1(__riscv_vle32_v_u32m1((const 
> uint32_t *)&ip, vl), 8, vl), 2, vl);
> vuint32m1_t vtbl_entry = __riscv_vluxei32_v_u32m1(
>     (const uint32_t *)(lpm->tbl24), vip_shifted, vl);
> ```
>
> > have you redefined the xmm_t type for proper index addressing?
>
> It is in `eal/riscv/include/rte_vect.h:`
>
> ```
> typedef int32_t xmm_t __attribute__((vector_size(16)));
> ```
>
> > I'd recommend that you use FIB to select an implementation at 
> runtime. All the rest LPM vector x4 implementations are done this way, 
> and their code is inlined.
> > Also, please consider writing a slightly more informative and 
> explanatory commit message.

The commit message still looks uninformative to me:

 >lpm_perf_autotest on BPI-F3

we have no idea what's that

 > scalar: 5.7 cycles

I'm not sure we want to have this information in commit message as well, 
because it is useless. Cycles depends on so much variable parts - what 
freq of the CPU was, what speed of memory, size of caches, and so on. 
This information is irrelevant and become obsolete pretty fast.

 From the latest commit:

 >The best way ... However, ... Therefore, ... this commit does not modify

 >Unifying the code style between lpm and fib may be worth considering 
in the future.

I don't think this is a good idea to put into the commit message 
information about what was NOT done.

You should put all this information (platform you were running, 
performance, implementation considerations and thoughts) into the patch 
notes.

>
> I agree that the FIB approach is clearly better here, but adopting 
> this method would require changing the function initialization logic 
> for all architectures in LPM, as well as updating the relevant structures.
>
> I'm not sure it's worth doing right now, since this commit is intended 
> to be just a small change for RISC-V. I'm more inclined to follow the 
> existing structure and avoid touching other architectures' code.
> Would it be acceptable to leave this kind of refactoring for the future?
>
> If you're certain it should be done now, I'll make the changes. For 
> now, I've only updated the commit message to include this idea (v2).
>
>
I'm not talking about adopting the FIB approach to the LPM. Instead, I 
suggested keeping LPM code consistent and leaving your implementation as 
a static inline function. And if you want to have runtime CPU flags 
check - you're welcome to do so in the FIB.

>
>
>     -----原始邮件-----
>     *发件人:*"Medvedkin, Vladimir" <vladimir.medvedkin@intel.com>
>     *发送时间:*2025-05-30 21:13:57 (星期五)
>     *收件人:* uk7b@foxmail.com, dev@dpdk.org
>     *抄送:* sunyuechi <sunyuechi@iscas.ac.cn>, "Thomas Monjalon"
>     <thomas@monjalon.net>, "Bruce Richardson"
>     <bruce.richardson@intel.com>, "Stanislaw Kardach"
>     <stanislaw.kardach@gmail.com>
>     *主题:* Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4
>
>     Hi Sunyuechi,
>
>
>     On 28/05/2025 18:00, uk7b@foxmail.com wrote:
>>     From: sunyuechi<sunyuechi@iscas.ac.cn> bpi-f3:
>>          scalar: 5.7 cycles
>>          rvv:    2.4 cycles
>>
>>     Maybe runtime detection in LPM should be added for all architectures,
>>     but this commit is only about the RVV part.
>
>     I would advise you to look into the FIB library, it has exactly what
>     you are looking for.
>
>     Also, please consider writing a slightly more informative and
>     explanatory commit message.
>
>>     Signed-off-by: sunyuechi<sunyuechi@iscas.ac.cn> ---
>>       MAINTAINERS           |  2 +
>>       lib/lpm/meson.build   |  1 +
>>       lib/lpm/rte_lpm.h     |  2 +
>>       lib/lpm/rte_lpm_rvv.h | 91 +++++++++++++++++++++++++++++++++++++++++++
>>       4 files changed, 96 insertions(+)
>>       create mode 100644 lib/lpm/rte_lpm_rvv.h
>>
>     <snip>
>>     +static inline void rte_lpm_lookupx4_rvv(
>>     +	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
>>     +{
>>     +	size_t vl = 4;
>>     +
>>     +	const uint32_t *tbl24_p = (const uint32_t *)lpm->tbl24;
>>     +	uint32_t tbl_entries[4] = {
>>     +		tbl24_p[((uint32_t)ip[0]) >> 8],
>>     +		tbl24_p[((uint32_t)ip[1]) >> 8],
>>     +		tbl24_p[((uint32_t)ip[2]) >> 8],
>>     +		tbl24_p[((uint32_t)ip[3]) >> 8],
>>     +	};
>
>     I'm not an expert in RISC-V, but why is it done in a scalar way
>     instead of using __riscv_vsrl_vx_u32m1()? I assume you're
>     relying on the compiler here?
>
>     Also, have you redefined the xmm_t type for proper index addressing?
>
>>     +	vuint32m1_t vtbl_entry = __riscv_vle32_v_u32m1(tbl_entries, vl);
>>     +
>>     +	vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(
>>     +	    __riscv_vand_vx_u32m1(vtbl_entry, RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl),
>>     +	    RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl);
>     <snip>
>>     +
>>     +static inline void rte_lpm_lookupx4(
>>     +	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
>>     +{
>>     +	lpm_lookupx4_impl(lpm, ip, hop, defv);
>>     +}
>>     +
>>     +RTE_INIT(rte_lpm_init_alg)
>>     +{
>>     +	lpm_lookupx4_impl = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V)
>>     +	    ? rte_lpm_lookupx4_rvv
>>     +	    : rte_lpm_lookupx4_scalar;
>>     +}
>     As I mentioned earlier, I'd recommend that you use FIB to select an
>     implementation at runtime. All the rest LPM vector x4
>     implementations are done this way, and their code is inlined.
>>     +
>>     +#ifdef __cplusplus
>>     +}
>>     +#endif
>>     +
>>     +#endif /* _RTE_LPM_RVV_H_ */
>
>     -- 
>     Regards,
>     Vladimir
>
-- 
Regards,
Vladimir

[-- Attachment #2: Type: text/html, Size: 19887 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4
  2025-06-04 17:03     ` Medvedkin, Vladimir
@ 2025-06-05 10:59       ` 孙越池
  0 siblings, 0 replies; 5+ messages in thread
From: 孙越池 @ 2025-06-05 10:59 UTC (permalink / raw)
  To: Medvedkin, Vladimir, dev

[-- Attachment #1: Type: text/plain, Size: 5770 bytes --]

Thank you for the detailed explanation, I've updated it in v4.


-----原始邮件-----
发件人:"Medvedkin, Vladimir" <vladimir.medvedkin@intel.com>
发送时间:2025-06-05 01:03:30 (星期四)
收件人: 孙越池 <sunyuechi@iscas.ac.cn>, dev@dpdk.org
抄送:
主题: Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4



Hi Sunyuechi,




On 04/06/2025 12:39, 孙越池 wrote:

> why is it done in a scalar way instead of using `__riscv_vsrl_vx_u32m1()?` I assume you're relying on the compiler here?

I don't know the exact reason, but based on experience, using indexed loads tends to be slower for small-scale and low-computation cases. So I've tried both methods.
In this case, if using `vsrl`, it would require `__riscv_vluxei32_v_u32m1`, which is much slower.

```
vuint32m1_t vip_shifted = __riscv_vsll_vx_u32m1(__riscv_vsrl_vx_u32m1(__riscv_vle32_v_u32m1((const uint32_t *)&ip, vl), 8, vl), 2, vl);
vuint32m1_t vtbl_entry = __riscv_vluxei32_v_u32m1(
    (const uint32_t *)(lpm->tbl24), vip_shifted, vl);
```

> have you redefined the xmm_t type for proper index addressing?

It is in `eal/riscv/include/rte_vect.h:`

```
typedef int32_t xmm_t __attribute__((vector_size(16)));
```

> I'd recommend that you use FIB to select an implementation at runtime. All the rest LPM vector x4 implementations are done this way, and their code is inlined.
> Also, please consider writing a slightly more informative and explanatory commit message.


The commit message still looks uninformative to me:

>lpm_perf_autotest on BPI-F3

we have no idea what's that

> scalar: 5.7 cycles

I'm not sure we want to have this information in commit message as well, because it is useless. Cycles depends on so much variable parts - what freq of the CPU was, what speed of memory, size of caches, and so on. This information is irrelevant and become obsolete pretty fast.

From the latest commit:

>The best way ... However, ... Therefore, ... this commit does not modify

>Unifying the code style between lpm and fib may be worth considering in the future.

I don't think this is a good idea to put into the commit message information about what was NOT done.

You should put all this information (platform you were running, performance, implementation considerations and thoughts) into the patch notes.


I agree that the FIB approach is clearly better here, but adopting this method would require changing the function initialization logic for all architectures in LPM, as well as updating the relevant structures.

I'm not sure it's worth doing right now, since this commit is intended to be just a small change for RISC-V. I'm more inclined to follow the existing structure and avoid touching other architectures' code.
Would it be acceptable to leave this kind of refactoring for the future?

If you're certain it should be done now, I'll make the changes. For now, I've only updated the commit message to include this idea (v2).






I'm not talking about adopting the FIB approach to the LPM. Instead, I suggested keeping LPM code consistent and leaving your implementation as a static inline function. And if you want to have runtime CPU flags check - you're welcome to do so in the FIB.



-----原始邮件-----
发件人:"Medvedkin, Vladimir" <vladimir.medvedkin@intel.com>
发送时间:2025-05-30 21:13:57 (星期五)
收件人:uk7b@foxmail.com, dev@dpdk.org
抄送: sunyuechi <sunyuechi@iscas.ac.cn>, "Thomas Monjalon" <thomas@monjalon.net>, "Bruce Richardson" <bruce.richardson@intel.com>, "Stanislaw Kardach" <stanislaw.kardach@gmail.com>
主题: Re: [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4



Hi Sunyuechi,




On 28/05/2025 18:00, uk7b@foxmail.com wrote:

From: sunyuechi <sunyuechi@iscas.ac.cn> bpi-f3:
    scalar: 5.7 cycles
    rvv:    2.4 cycles

Maybe runtime detection in LPM should be added for all architectures,
but this commit is only about the RVV part.


I would advise you to look into the FIB library, it has exactly what you are looking for.

Also, please consider writing a slightly more informative and explanatory commit message.

Signed-off-by: sunyuechi <sunyuechi@iscas.ac.cn> ---
 MAINTAINERS           |  2 +
 lib/lpm/meson.build   |  1 +
 lib/lpm/rte_lpm.h     |  2 +
 lib/lpm/rte_lpm_rvv.h | 91 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 lib/lpm/rte_lpm_rvv.h


<snip>
+static inline void rte_lpm_lookupx4_rvv(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	size_t vl = 4;
+
+	const uint32_t *tbl24_p = (const uint32_t *)lpm->tbl24;
+	uint32_t tbl_entries[4] = {
+		tbl24_p[((uint32_t)ip[0]) >> 8],
+		tbl24_p[((uint32_t)ip[1]) >> 8],
+		tbl24_p[((uint32_t)ip[2]) >> 8],
+		tbl24_p[((uint32_t)ip[3]) >> 8],
+	};

I'm not an expert in RISC-V, but why is it done in a scalar way instead of using __riscv_vsrl_vx_u32m1()? I assume you're relying on the compiler here?

Also, have you redefined the xmm_t type for proper index addressing?

+	vuint32m1_t vtbl_entry = __riscv_vle32_v_u32m1(tbl_entries, vl);
+
+	vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(
+	    __riscv_vand_vx_u32m1(vtbl_entry, RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl),
+	    RTE_LPM_VALID_EXT_ENTRY_BITMASK, vl);

<snip>
+
+static inline void rte_lpm_lookupx4(
+	const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv)
+{
+	lpm_lookupx4_impl(lpm, ip, hop, defv);
+}
+
+RTE_INIT(rte_lpm_init_alg)
+{
+	lpm_lookupx4_impl = rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_ISA_V)
+	    ? rte_lpm_lookupx4_rvv
+	    : rte_lpm_lookupx4_scalar;
+}
As I mentioned earlier, I'd recommend that you use FIB to select an implementation at runtime. All the rest LPM vector x4 implementations are done this way, and their code is inlined.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_RVV_H_ */

-- 
Regards,
Vladimir
-- 
Regards,
Vladimir

[-- Attachment #2: Type: text/html, Size: 19763 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-06-05 11:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-28 17:00 [PATCH 2/3] lib/lpm: R-V V rte_lpm_lookupx4 uk7b
2025-05-30 13:13 ` Medvedkin, Vladimir
2025-06-04 11:39   ` 孙越池
2025-06-04 17:03     ` Medvedkin, Vladimir
2025-06-05 10:59       ` 孙越池

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).