* [PATCH 1/1] lpm: add a scalar version of lookupx4 function @ 2022-05-10 11:58 Stanislaw Kardach 2022-05-19 17:02 ` Medvedkin, Vladimir 2022-05-27 18:18 ` [PATCH v2 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 0 siblings, 2 replies; 19+ messages in thread From: Stanislaw Kardach @ 2022-05-10 11:58 UTC (permalink / raw) To: Bruce Richardson Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream, Stanislaw Kardach From: Michal Mazurek <maz@semihalf.com> Add an implementation of the rte_lpm_lookupx4() function for platforms without support for vector operations. This will be useful in the upcoming RISC-V port as well as any platform which may want to start with a basic level of LPM support. Signed-off-by: Michal Mazurek <maz@semihalf.com> Signed-off-by: Stanislaw Kardach <kda@semihalf.com> --- doc/guides/rel_notes/release_22_07.rst | 5 + lib/lpm/meson.build | 1 + lib/lpm/rte_lpm.h | 4 +- lib/lpm/rte_lpm_scalar.h | 122 +++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 lib/lpm/rte_lpm_scalar.h diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst index 4ae91dd94d..73e8d632f2 100644 --- a/doc/guides/rel_notes/release_22_07.rst +++ b/doc/guides/rel_notes/release_22_07.rst @@ -70,6 +70,11 @@ New Features * Added AH mode support in lookaside protocol (IPsec) for CN9K & CN10K. * Added AES-GMAC support in lookaside protocol (IPsec) for CN9K & CN10K. +* **Added scalar version of the LPM library.** + + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back + implementation for platforms that don't support vector operations. + Removed Items ------------- diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build index 78d91d3421..6b47361fce 100644 --- a/lib/lpm/meson.build +++ b/lib/lpm/meson.build @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') indirect_headers += files( 'rte_lpm_altivec.h', 'rte_lpm_neon.h', + 'rte_lpm_scalar.h', 'rte_lpm_sse.h', 'rte_lpm_sve.h', ) diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h index eb91960e81..b5db6a353a 100644 --- a/lib/lpm/rte_lpm.h +++ b/lib/lpm/rte_lpm.h @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], #endif #elif defined(RTE_ARCH_PPC_64) #include "rte_lpm_altivec.h" -#else +#elif defined(RTE_ARCH_X86) #include "rte_lpm_sse.h" +#else +#include "rte_lpm_scalar.h" #endif #ifdef __cplusplus diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h new file mode 100644 index 0000000000..991b94e687 --- /dev/null +++ b/lib/lpm/rte_lpm_scalar.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2022 StarFive + * Copyright(c) 2022 SiFive + * Copyright(c) 2022 Semihalf + */ + +#ifndef _RTE_LPM_SCALAR_H_ +#define _RTE_LPM_SCALAR_H_ + +#include <rte_branch_prediction.h> +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_vect.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv) +{ + rte_xmm_t i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t pt, pt2; + const uint32_t *ptbl; + + const rte_xmm_t mask8 = { + .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}}; + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). 
+ */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24.x = ip; + i24.u32[0] >>= CHAR_BIT; + i24.u32[1] >>= CHAR_BIT; + i24.u32[2] >>= CHAR_BIT; + i24.u32[3] >>= CHAR_BIT; + + /* extract values from tbl24[] */ + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[0]]; + tbl[0] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[1]]; + tbl[1] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[2]]; + tbl[2] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[3]]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = ip; + i8.u64[0] &= mask8.u64[0]; + i8.u64[1] &= mask8.u64[1]; + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. */ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (tbl[0] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (tbl[1] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (tbl[2] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (tbl[3] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_SCALAR_H_ */ -- 2.30.2 ^ permalink raw reply [flat|nested] 19+ messages in thread
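A minimal usage sketch of the new x4 lookup may help readers less familiar with the LPM API. Everything here other than rte_lpm_lookupx4() itself (rte_lpm_create(), rte_lpm_add(), RTE_IPV4(), rte_xmm_t) is pre-existing DPDK API, and the table parameters and addresses are arbitrary illustration values, not taken from the patch:

#include <rte_ip.h>
#include <rte_lpm.h>

static void
lookupx4_example(void)
{
	struct rte_lpm_config cfg = { .max_rules = 16, .number_tbl8s = 256 };
	struct rte_lpm *lpm = rte_lpm_create("example", SOCKET_ID_ANY, &cfg);
	uint32_t hop[4];
	rte_xmm_t ip;

	if (lpm == NULL)
		return;

	/* 10.0.0.0/8 -> next hop 1 */
	rte_lpm_add(lpm, RTE_IPV4(10, 0, 0, 0), 8, 1);

	ip.u32[0] = RTE_IPV4(10, 1, 2, 3);	/* hit  -> hop[0] == 1 */
	ip.u32[1] = RTE_IPV4(10, 4, 5, 6);	/* hit  -> hop[1] == 1 */
	ip.u32[2] = RTE_IPV4(192, 168, 0, 1);	/* miss -> hop[2] == UINT32_MAX */
	ip.u32[3] = RTE_IPV4(10, 255, 0, 1);	/* hit  -> hop[3] == 1 */

	/* misses fall back to the default value given as the last argument */
	rte_lpm_lookupx4(lpm, ip.x, hop, UINT32_MAX);

	rte_lpm_free(lpm);
}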
* Re: [PATCH 1/1] lpm: add a scalar version of lookupx4 function 2022-05-10 11:58 [PATCH 1/1] lpm: add a scalar version of lookupx4 function Stanislaw Kardach @ 2022-05-19 17:02 ` Medvedkin, Vladimir 2022-05-24 16:28 ` Stanisław Kardach 2022-05-27 18:18 ` [PATCH v2 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 1 sibling, 1 reply; 19+ messages in thread From: Medvedkin, Vladimir @ 2022-05-19 17:02 UTC (permalink / raw) To: Stanislaw Kardach Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream, Bruce Richardson Hi Stanislaw, Michal, As far as I can see, this implementation almost completely repeats other lookupx4() implementations, except for the use of vector instructions. On my board (x86_64) in lpm_perf_autotest your implementation takes about: LPM LookupX4: 29.5 cycles (fails = 12.5%) replacing this code with a simple loop with rte_lpm_lookup(): uint32_t nh; int i, ret; for (i = 0; i < 4; i++) { ret = rte_lpm_lookup((struct rte_lpm *)lpm, ((rte_xmm_t)ip).u32[i], &nh); hop[i] = (ret == 0) ? nh : defv; } works faster: LPM LookupX4: 22.2 cycles (fails = 12.5%) I'm wondering if this will work faster on your board (I assume it it RISC-V arch)? Thanks! On 10/05/2022 12:58, Stanislaw Kardach wrote: > From: Michal Mazurek <maz@semihalf.com> > > Add an implementation of the rte_lpm_lookupx4() function for platforms > without support for vector operations. > > This will be useful in the upcoming RISC-V port as well as any platform > which may want to start with a basic level of LPM support. > > Signed-off-by: Michal Mazurek <maz@semihalf.com> > Signed-off-by: Stanislaw Kardach <kda@semihalf.com> > --- > doc/guides/rel_notes/release_22_07.rst | 5 + > lib/lpm/meson.build | 1 + > lib/lpm/rte_lpm.h | 4 +- > lib/lpm/rte_lpm_scalar.h | 122 +++++++++++++++++++++++++ > 4 files changed, 131 insertions(+), 1 deletion(-) > create mode 100644 lib/lpm/rte_lpm_scalar.h > > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst > index 4ae91dd94d..73e8d632f2 100644 > --- a/doc/guides/rel_notes/release_22_07.rst > +++ b/doc/guides/rel_notes/release_22_07.rst > @@ -70,6 +70,11 @@ New Features > * Added AH mode support in lookaside protocol (IPsec) for CN9K & CN10K. > * Added AES-GMAC support in lookaside protocol (IPsec) for CN9K & CN10K. > > +* **Added scalar version of the LPM library.** > + > + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back > + implementation for platforms that don't support vector operations. 
> + > > Removed Items > ------------- > diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build > index 78d91d3421..6b47361fce 100644 > --- a/lib/lpm/meson.build > +++ b/lib/lpm/meson.build > @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') > indirect_headers += files( > 'rte_lpm_altivec.h', > 'rte_lpm_neon.h', > + 'rte_lpm_scalar.h', > 'rte_lpm_sse.h', > 'rte_lpm_sve.h', > ) > diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h > index eb91960e81..b5db6a353a 100644 > --- a/lib/lpm/rte_lpm.h > +++ b/lib/lpm/rte_lpm.h > @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > #endif > #elif defined(RTE_ARCH_PPC_64) > #include "rte_lpm_altivec.h" > -#else > +#elif defined(RTE_ARCH_X86) > #include "rte_lpm_sse.h" > +#else > +#include "rte_lpm_scalar.h" > #endif > > #ifdef __cplusplus > diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h > new file mode 100644 > index 0000000000..991b94e687 > --- /dev/null > +++ b/lib/lpm/rte_lpm_scalar.h > @@ -0,0 +1,122 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2022 StarFive > + * Copyright(c) 2022 SiFive > + * Copyright(c) 2022 Semihalf > + */ > + > +#ifndef _RTE_LPM_SCALAR_H_ > +#define _RTE_LPM_SCALAR_H_ > + > +#include <rte_branch_prediction.h> > +#include <rte_byteorder.h> > +#include <rte_common.h> > +#include <rte_vect.h> > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +static inline void > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > + uint32_t defv) > +{ > + rte_xmm_t i24; > + rte_xmm_t i8; > + uint32_t tbl[4]; > + uint64_t pt, pt2; > + const uint32_t *ptbl; > + > + const rte_xmm_t mask8 = { > + .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}}; > + > + /* > + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries > + * as one 64-bit value (0x0300000003000000). > + */ > + const uint64_t mask_xv = > + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | > + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); > + > + /* > + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries > + * as one 64-bit value (0x0100000001000000). > + */ > + const uint64_t mask_v = > + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | > + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); > + > + /* get 4 indexes for tbl24[]. */ > + i24.x = ip; > + i24.u32[0] >>= CHAR_BIT; > + i24.u32[1] >>= CHAR_BIT; > + i24.u32[2] >>= CHAR_BIT; > + i24.u32[3] >>= CHAR_BIT; > + > + /* extract values from tbl24[] */ > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[0]]; > + tbl[0] = *ptbl; > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[1]]; > + tbl[1] = *ptbl; > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[2]]; > + tbl[2] = *ptbl; > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[3]]; > + tbl[3] = *ptbl; > + > + /* get 4 indexes for tbl8[]. */ > + i8.x = ip; > + i8.u64[0] &= mask8.u64[0]; > + i8.u64[1] &= mask8.u64[1]; > + > + pt = (uint64_t)tbl[0] | > + (uint64_t)tbl[1] << 32; > + pt2 = (uint64_t)tbl[2] | > + (uint64_t)tbl[3] << 32; > + > + /* search successfully finished for all 4 IP addresses. 
*/ > + if (likely((pt & mask_xv) == mask_v) && > + likely((pt2 & mask_xv) == mask_v)) { > + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; > + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; > + return; > + } > + > + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > + i8.u32[0] = i8.u32[0] + > + (tbl[0] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; > + tbl[0] = *ptbl; > + } > + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > + i8.u32[1] = i8.u32[1] + > + (tbl[1] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; > + tbl[1] = *ptbl; > + } > + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > + i8.u32[2] = i8.u32[2] + > + (tbl[2] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; > + tbl[2] = *ptbl; > + } > + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > + i8.u32[3] = i8.u32[3] + > + (tbl[3] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; > + tbl[3] = *ptbl; > + } > + > + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv; > + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv; > + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv; > + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; > +} > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif /* _RTE_LPM_SCALAR_H_ */ -- Regards, Vladimir ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] lpm: add a scalar version of lookupx4 function 2022-05-19 17:02 ` Medvedkin, Vladimir @ 2022-05-24 16:28 ` Stanisław Kardach 2022-05-27 11:16 ` Stanisław Kardach 0 siblings, 1 reply; 19+ messages in thread From: Stanisław Kardach @ 2022-05-24 16:28 UTC (permalink / raw) To: Medvedkin, Vladimir Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, Marcin Wojtas, upstream, Bruce Richardson On Thu, May 19, 2022 at 7:04 PM Medvedkin, Vladimir <vladimir.medvedkin@intel.com> wrote: > > Hi Stanislaw, Michal, > > As far as I can see, this implementation almost completely repeats other > lookupx4() implementations, except for the use of vector instructions. > > On my board (x86_64) in lpm_perf_autotest your implementation takes about: > LPM LookupX4: 29.5 cycles (fails = 12.5%) > > replacing this code with a simple loop with rte_lpm_lookup(): > > uint32_t nh; > int i, ret; > > for (i = 0; i < 4; i++) { > ret = rte_lpm_lookup((struct rte_lpm *)lpm, ((rte_xmm_t)ip).u32[i], &nh); > hop[i] = (ret == 0) ? nh : defv; > } > > works faster: > LPM LookupX4: 22.2 cycles (fails = 12.5%) > > I'm wondering if this will work faster on your board (I assume it it > RISC-V arch)? Hi Vladimir, On my HiFive Unmatched RISC-V board there is a marginal difference (~ -1.56%): Our version: 210.5 cycles (fails = 12.5%) rte_lpm_lookup version: 213.8 cycles (fails = 12.5%) Given that x86 is faster with rte_lpm_lookup, I'll change to this implementation in the next version. That said I wonder why do we have different const requirements for rte_lpm_lookup() and rte_lpm_lookupx4(): static inline int rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv); I think both should be const. > > Thanks! > > On 10/05/2022 12:58, Stanislaw Kardach wrote: > > From: Michal Mazurek <maz@semihalf.com> > > > > Add an implementation of the rte_lpm_lookupx4() function for platforms > > without support for vector operations. > > > > This will be useful in the upcoming RISC-V port as well as any platform > > which may want to start with a basic level of LPM support. > > > > Signed-off-by: Michal Mazurek <maz@semihalf.com> > > Signed-off-by: Stanislaw Kardach <kda@semihalf.com> > > --- > > doc/guides/rel_notes/release_22_07.rst | 5 + > > lib/lpm/meson.build | 1 + > > lib/lpm/rte_lpm.h | 4 +- > > lib/lpm/rte_lpm_scalar.h | 122 +++++++++++++++++++++++++ > > 4 files changed, 131 insertions(+), 1 deletion(-) > > create mode 100644 lib/lpm/rte_lpm_scalar.h > > > > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst > > index 4ae91dd94d..73e8d632f2 100644 > > --- a/doc/guides/rel_notes/release_22_07.rst > > +++ b/doc/guides/rel_notes/release_22_07.rst > > @@ -70,6 +70,11 @@ New Features > > * Added AH mode support in lookaside protocol (IPsec) for CN9K & CN10K. > > * Added AES-GMAC support in lookaside protocol (IPsec) for CN9K & CN10K. > > > > +* **Added scalar version of the LPM library.** > > + > > + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back > > + implementation for platforms that don't support vector operations. 
> > + > > > > Removed Items > > ------------- > > diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build > > index 78d91d3421..6b47361fce 100644 > > --- a/lib/lpm/meson.build > > +++ b/lib/lpm/meson.build > > @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') > > indirect_headers += files( > > 'rte_lpm_altivec.h', > > 'rte_lpm_neon.h', > > + 'rte_lpm_scalar.h', > > 'rte_lpm_sse.h', > > 'rte_lpm_sve.h', > > ) > > diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h > > index eb91960e81..b5db6a353a 100644 > > --- a/lib/lpm/rte_lpm.h > > +++ b/lib/lpm/rte_lpm.h > > @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > > #endif > > #elif defined(RTE_ARCH_PPC_64) > > #include "rte_lpm_altivec.h" > > -#else > > +#elif defined(RTE_ARCH_X86) > > #include "rte_lpm_sse.h" > > +#else > > +#include "rte_lpm_scalar.h" > > #endif > > > > #ifdef __cplusplus > > diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h > > new file mode 100644 > > index 0000000000..991b94e687 > > --- /dev/null > > +++ b/lib/lpm/rte_lpm_scalar.h > > @@ -0,0 +1,122 @@ > > +/* SPDX-License-Identifier: BSD-3-Clause > > + * Copyright(c) 2022 StarFive > > + * Copyright(c) 2022 SiFive > > + * Copyright(c) 2022 Semihalf > > + */ > > + > > +#ifndef _RTE_LPM_SCALAR_H_ > > +#define _RTE_LPM_SCALAR_H_ > > + > > +#include <rte_branch_prediction.h> > > +#include <rte_byteorder.h> > > +#include <rte_common.h> > > +#include <rte_vect.h> > > + > > +#ifdef __cplusplus > > +extern "C" { > > +#endif > > + > > +static inline void > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > > + uint32_t defv) > > +{ > > + rte_xmm_t i24; > > + rte_xmm_t i8; > > + uint32_t tbl[4]; > > + uint64_t pt, pt2; > > + const uint32_t *ptbl; > > + > > + const rte_xmm_t mask8 = { > > + .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}}; > > + > > + /* > > + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries > > + * as one 64-bit value (0x0300000003000000). > > + */ > > + const uint64_t mask_xv = > > + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | > > + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); > > + > > + /* > > + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries > > + * as one 64-bit value (0x0100000001000000). > > + */ > > + const uint64_t mask_v = > > + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | > > + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); > > + > > + /* get 4 indexes for tbl24[]. */ > > + i24.x = ip; > > + i24.u32[0] >>= CHAR_BIT; > > + i24.u32[1] >>= CHAR_BIT; > > + i24.u32[2] >>= CHAR_BIT; > > + i24.u32[3] >>= CHAR_BIT; > > + > > + /* extract values from tbl24[] */ > > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[0]]; > > + tbl[0] = *ptbl; > > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[1]]; > > + tbl[1] = *ptbl; > > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[2]]; > > + tbl[2] = *ptbl; > > + ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[3]]; > > + tbl[3] = *ptbl; > > + > > + /* get 4 indexes for tbl8[]. */ > > + i8.x = ip; > > + i8.u64[0] &= mask8.u64[0]; > > + i8.u64[1] &= mask8.u64[1]; > > + > > + pt = (uint64_t)tbl[0] | > > + (uint64_t)tbl[1] << 32; > > + pt2 = (uint64_t)tbl[2] | > > + (uint64_t)tbl[3] << 32; > > + > > + /* search successfully finished for all 4 IP addresses. 
*/ > > + if (likely((pt & mask_xv) == mask_v) && > > + likely((pt2 & mask_xv) == mask_v)) { > > + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; > > + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; > > + return; > > + } > > + > > + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > > + i8.u32[0] = i8.u32[0] + > > + (tbl[0] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; > > + tbl[0] = *ptbl; > > + } > > + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > > + i8.u32[1] = i8.u32[1] + > > + (tbl[1] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; > > + tbl[1] = *ptbl; > > + } > > + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > > + i8.u32[2] = i8.u32[2] + > > + (tbl[2] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; > > + tbl[2] = *ptbl; > > + } > > + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == > > + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { > > + i8.u32[3] = i8.u32[3] + > > + (tbl[3] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; > > + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; > > + tbl[3] = *ptbl; > > + } > > + > > + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv; > > + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv; > > + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv; > > + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; > > +} > > + > > +#ifdef __cplusplus > > +} > > +#endif > > + > > +#endif /* _RTE_LPM_SCALAR_H_ */ > > -- > Regards, > Vladimir ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/1] lpm: add a scalar version of lookupx4 function 2022-05-24 16:28 ` Stanisław Kardach @ 2022-05-27 11:16 ` Stanisław Kardach 2022-05-27 13:16 ` Medvedkin, Vladimir 0 siblings, 1 reply; 19+ messages in thread From: Stanisław Kardach @ 2022-05-27 11:16 UTC (permalink / raw) To: Medvedkin, Vladimir Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, Marcin Wojtas, upstream, Bruce Richardson On Tue, May 24, 2022 at 6:28 PM Stanisław Kardach <kda@semihalf.com> wrote: <snip> > That said I wonder why do we have different const requirements for > rte_lpm_lookup() and rte_lpm_lookupx4(): > static inline int rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, > uint32_t *next_hop) > static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t > ip, uint32_t hop[4], uint32_t defv); > I think both should be const. > To re-iterate the question, should I also post a patch for changing rte_lpm_lookup() to add "const" to "struct rte_lpm *lpm" argument? rte_lpm_lookup_bulk_func() and rte_lpm_lookupx4() already take lpm as const. I'm pushing because otherwise I get a const discard warning in the scalar version of rte_lpm_lookupx4() utilizing rte_lpm_lookup(). Best Regards, Stanislaw Kardach ^ permalink raw reply [flat|nested] 19+ messages in thread
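To illustrate why the asymmetry matters to callers, consider a hypothetical read-only fast-path helper (route_batch() below is an invented name, not DPDK API): with the current prototypes it can pass its const table pointer to rte_lpm_lookupx4() or the bulk lookup, but has to cast the const away for plain rte_lpm_lookup():

#include <stdint.h>
#include <rte_lpm.h>

static void
route_batch(const struct rte_lpm *lpm, const uint32_t *ips, uint32_t *hops,
		unsigned int n)
{
	uint32_t nh;
	unsigned int i;

	for (i = 0; i < n; i++) {
		/* the cast is only needed because rte_lpm_lookup() lacks
		 * const; without it gcc warns (-Wdiscarded-qualifiers) */
		if (rte_lpm_lookup((struct rte_lpm *)lpm, ips[i], &nh) == 0)
			hops[i] = nh;
		else
			hops[i] = UINT32_MAX;
	}
}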
* Re: [PATCH 1/1] lpm: add a scalar version of lookupx4 function 2022-05-27 11:16 ` Stanisław Kardach @ 2022-05-27 13:16 ` Medvedkin, Vladimir 0 siblings, 0 replies; 19+ messages in thread From: Medvedkin, Vladimir @ 2022-05-27 13:16 UTC (permalink / raw) To: Stanisław Kardach Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, Marcin Wojtas, upstream, Bruce Richardson Hi Stanislaw, On 27/05/2022 12:16, Stanisław Kardach wrote: > On Tue, May 24, 2022 at 6:28 PM Stanisław Kardach <kda@semihalf.com> wrote: > <snip> >> That said I wonder why do we have different const requirements for >> rte_lpm_lookup() and rte_lpm_lookupx4(): >> static inline int rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, >> uint32_t *next_hop) >> static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t >> ip, uint32_t hop[4], uint32_t defv); >> I think both should be const. >> > To re-iterate the question, should I also post a patch for changing > rte_lpm_lookup() to add "const" to "struct rte_lpm *lpm" argument? > rte_lpm_lookup_bulk_func() and rte_lpm_lookupx4() already take lpm as > const. > I'm pushing because otherwise I get a const discard warning in the > scalar version of rte_lpm_lookupx4() utilizing rte_lpm_lookup(). Since these are inline functions, there will be no problems with the ABI/API, so please add const to the *lpm argument. Thanks! > > Best Regards, > Stanislaw Kardach -- Regards, Vladimir ^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v2 1/2] lpm: add const to lpm arg of rte_lpm_lookup 2022-05-10 11:58 [PATCH 1/1] lpm: add a scalar version of lookupx4 function Stanislaw Kardach 2022-05-19 17:02 ` Medvedkin, Vladimir @ 2022-05-27 18:18 ` Stanislaw Kardach 2022-05-27 18:18 ` [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach 2022-05-30 18:24 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 1 sibling, 2 replies; 19+ messages in thread From: Stanislaw Kardach @ 2022-05-27 18:18 UTC (permalink / raw) To: Vladimir Medvedkin Cc: Stanislaw Kardach, dev, Frank Zhao, Sam Grove, mw, upstream All other rte_lpm_lookup* functions take lpm argument as a const. As the basic rte_lpm_lookup() performs the same function, it should also do that. As this function is inline, no API/ABI change happens. Signed-off-by: Stanislaw Kardach <kda@semihalf.com> --- lib/lpm/rte_lpm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h index eb91960e81..1cf863a146 100644 --- a/lib/lpm/rte_lpm.h +++ b/lib/lpm/rte_lpm.h @@ -279,7 +279,7 @@ rte_lpm_delete_all(struct rte_lpm *lpm); * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit */ static inline int -rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) +rte_lpm_lookup(const struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) { unsigned tbl24_index = (ip >> 8); uint32_t tbl_entry; -- 2.30.2 ^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-27 18:18 ` [PATCH v2 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach @ 2022-05-27 18:18 ` Stanislaw Kardach 2022-05-27 20:15 ` Stephen Hemminger 2022-05-30 18:24 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 1 sibling, 1 reply; 19+ messages in thread From: Stanislaw Kardach @ 2022-05-27 18:18 UTC (permalink / raw) To: Vladimir Medvedkin Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream, Stanislaw Kardach From: Michal Mazurek <maz@semihalf.com> Add an implementation of the rte_lpm_lookupx4() function for platforms without support for vector operations. This will be useful in the upcoming RISC-V port as well as any platform which may want to start with a basic level of LPM support. Signed-off-by: Michal Mazurek <maz@semihalf.com> Signed-off-by: Stanislaw Kardach <kda@semihalf.com> --- doc/guides/rel_notes/release_22_07.rst | 5 ++++ lib/lpm/meson.build | 1 + lib/lpm/rte_lpm.h | 4 ++- lib/lpm/rte_lpm_scalar.h | 36 ++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 lib/lpm/rte_lpm_scalar.h diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst index e49cacecef..0cf3f71269 100644 --- a/doc/guides/rel_notes/release_22_07.rst +++ b/doc/guides/rel_notes/release_22_07.rst @@ -104,6 +104,11 @@ New Features * ``RTE_EVENT_QUEUE_ATTR_WEIGHT`` * ``RTE_EVENT_QUEUE_ATTR_AFFINITY`` +* **Added scalar version of the LPM library.** + + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back + implementation for platforms that don't support vector operations. + Removed Items ------------- diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build index 78d91d3421..6b47361fce 100644 --- a/lib/lpm/meson.build +++ b/lib/lpm/meson.build @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') indirect_headers += files( 'rte_lpm_altivec.h', 'rte_lpm_neon.h', + 'rte_lpm_scalar.h', 'rte_lpm_sse.h', 'rte_lpm_sve.h', ) diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h index 1cf863a146..4f38864fde 100644 --- a/lib/lpm/rte_lpm.h +++ b/lib/lpm/rte_lpm.h @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], #endif #elif defined(RTE_ARCH_PPC_64) #include "rte_lpm_altivec.h" -#else +#elif defined(RTE_ARCH_X86) #include "rte_lpm_sse.h" +#else +#include "rte_lpm_scalar.h" #endif #ifdef __cplusplus diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h new file mode 100644 index 0000000000..2fc0e19161 --- /dev/null +++ b/lib/lpm/rte_lpm_scalar.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2022 StarFive + * Copyright(c) 2022 SiFive + * Copyright(c) 2022 Semihalf + */ + +#ifndef _RTE_LPM_SCALAR_H_ +#define _RTE_LPM_SCALAR_H_ + +#include <rte_branch_prediction.h> +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_vect.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv) +{ + uint32_t nh; + int i, ret; + + for (i = 0; i < 4; i++) { + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); + hop[i] = (ret == 0) ? nh : defv; + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_SCALAR_H_ */ -- 2.30.2 ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-27 18:18 ` [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach @ 2022-05-27 20:15 ` Stephen Hemminger 2022-05-30 7:52 ` Bruce Richardson 0 siblings, 1 reply; 19+ messages in thread From: Stephen Hemminger @ 2022-05-27 20:15 UTC (permalink / raw) To: Stanislaw Kardach Cc: Vladimir Medvedkin, Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream On Fri, 27 May 2022 20:18:22 +0200 Stanislaw Kardach <kda@semihalf.com> wrote: > +static inline void > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > + uint32_t defv) > +{ > + uint32_t nh; > + int i, ret; > + > + for (i = 0; i < 4; i++) { > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); > + hop[i] = (ret == 0) ? nh : defv; > + } > +} For performance, manually unroll the loop. ^ permalink raw reply [flat|nested] 19+ messages in thread
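For concreteness, the manually unrolled variant being suggested would look roughly as follows (this is essentially the shape the function takes later in the thread, in v3):

static inline void
rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
		uint32_t defv)
{
	uint32_t nh;
	int ret;

	ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[0], &nh);
	hop[0] = (ret == 0) ? nh : defv;
	ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[1], &nh);
	hop[1] = (ret == 0) ? nh : defv;
	ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[2], &nh);
	hop[2] = (ret == 0) ? nh : defv;
	ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[3], &nh);
	hop[3] = (ret == 0) ? nh : defv;
}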
* Re: [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-27 20:15 ` Stephen Hemminger @ 2022-05-30 7:52 ` Bruce Richardson 2022-05-30 8:00 ` Morten Brørup 0 siblings, 1 reply; 19+ messages in thread From: Bruce Richardson @ 2022-05-30 7:52 UTC (permalink / raw) To: Stephen Hemminger Cc: Stanislaw Kardach, Vladimir Medvedkin, Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream On Fri, May 27, 2022 at 01:15:20PM -0700, Stephen Hemminger wrote: > On Fri, 27 May 2022 20:18:22 +0200 > Stanislaw Kardach <kda@semihalf.com> wrote: > > > +static inline void > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > > + uint32_t defv) > > +{ > > + uint32_t nh; > > + int i, ret; > > + > > + for (i = 0; i < 4; i++) { > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); > > + hop[i] = (ret == 0) ? nh : defv; > > + } > > +} > > For performance, manually unroll the loop. Given a constant 4x iterations, will compilers not unroll this automatically. I think the loop is a little clearer if it can be kept /Bruce ^ permalink raw reply [flat|nested] 19+ messages in thread
* RE: [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-30 7:52 ` Bruce Richardson @ 2022-05-30 8:00 ` Morten Brørup 2022-05-30 10:42 ` Bruce Richardson 0 siblings, 1 reply; 19+ messages in thread From: Morten Brørup @ 2022-05-30 8:00 UTC (permalink / raw) To: Bruce Richardson, Stephen Hemminger Cc: Stanislaw Kardach, Vladimir Medvedkin, Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > Sent: Monday, 30 May 2022 09.52 > > On Fri, May 27, 2022 at 01:15:20PM -0700, Stephen Hemminger wrote: > > On Fri, 27 May 2022 20:18:22 +0200 > > Stanislaw Kardach <kda@semihalf.com> wrote: > > > > > +static inline void > > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t > hop[4], > > > + uint32_t defv) > > > +{ > > > + uint32_t nh; > > > + int i, ret; > > > + > > > + for (i = 0; i < 4; i++) { > > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); > > > + hop[i] = (ret == 0) ? nh : defv; > > > + } > > > +} > > > > For performance, manually unroll the loop. > > Given a constant 4x iterations, will compilers not unroll this > automatically. I think the loop is a little clearer if it can be kept > > /Bruce If in doubt, add this and look at the assembler output: #define REVIEW_INLINE_FUNCTIONS 1 #if REVIEW_INLINE_FUNCTIONS /* For compiler output review purposes only. */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmissing-prototypes" void review_rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv) { rte_lpm_lookupx4(lpm, ip, hop, defv); } #pragma GCC diagnostic pop #endif /* REVIEW_INLINE_FUNCTIONS */ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-30 8:00 ` Morten Brørup @ 2022-05-30 10:42 ` Bruce Richardson 2022-05-30 11:20 ` Stanisław Kardach 0 siblings, 1 reply; 19+ messages in thread From: Bruce Richardson @ 2022-05-30 10:42 UTC (permalink / raw) To: Morten Brørup Cc: Stephen Hemminger, Stanislaw Kardach, Vladimir Medvedkin, Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream On Mon, May 30, 2022 at 10:00:34AM +0200, Morten Brørup wrote: > > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > > Sent: Monday, 30 May 2022 09.52 > > > > On Fri, May 27, 2022 at 01:15:20PM -0700, Stephen Hemminger wrote: > > > On Fri, 27 May 2022 20:18:22 +0200 > > > Stanislaw Kardach <kda@semihalf.com> wrote: > > > > > > > +static inline void > > > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t > > hop[4], > > > > + uint32_t defv) > > > > +{ > > > > + uint32_t nh; > > > > + int i, ret; > > > > + > > > > + for (i = 0; i < 4; i++) { > > > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); > > > > + hop[i] = (ret == 0) ? nh : defv; > > > > + } > > > > +} > > > > > > For performance, manually unroll the loop. > > > > Given a constant 4x iterations, will compilers not unroll this > > automatically. I think the loop is a little clearer if it can be kept > > > > /Bruce > > If in doubt, add this and look at the assembler output: > > #define REVIEW_INLINE_FUNCTIONS 1 > > #if REVIEW_INLINE_FUNCTIONS /* For compiler output review purposes only. */ > #pragma GCC diagnostic push > #pragma GCC diagnostic ignored "-Wmissing-prototypes" > void review_rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv) > { > rte_lpm_lookupx4(lpm, ip, hop, defv); > } > #pragma GCC diagnostic pop > #endif /* REVIEW_INLINE_FUNCTIONS */ > Used godbolt.org to check and indeed the function is not unrolled. (Gcc 11.2, with flags "-O3 -march=icelake-server"). Manually unrolling changes the assembly generated in interesting ways. For example, it appears to generate more cmov-type instructions for the miss/default-value case rather than using branches as in the looped version. Whether this is better or not may depend upon usecase - if one expects most lpm lookup entries to hit, then having (predictable) branches may well be cheaper. In any case, I'll withdraw any object to unrolling, but I'm still not convinced it's necessary. /Bruce ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-30 10:42 ` Bruce Richardson @ 2022-05-30 11:20 ` Stanisław Kardach 2022-05-30 12:46 ` Bruce Richardson 0 siblings, 1 reply; 19+ messages in thread From: Stanisław Kardach @ 2022-05-30 11:20 UTC (permalink / raw) To: Bruce Richardson Cc: Morten Brørup, Stephen Hemminger, Vladimir Medvedkin, Michal Mazurek, dev, Frank Zhao, Sam Grove, Marcin Wojtas, upstream On Mon, May 30, 2022 at 12:42 PM Bruce Richardson <bruce.richardson@intel.com> wrote: > > On Mon, May 30, 2022 at 10:00:34AM +0200, Morten Brørup wrote: > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > > > Sent: Monday, 30 May 2022 09.52 > > > > > > On Fri, May 27, 2022 at 01:15:20PM -0700, Stephen Hemminger wrote: > > > > On Fri, 27 May 2022 20:18:22 +0200 > > > > Stanislaw Kardach <kda@semihalf.com> wrote: > > > > > > > > > +static inline void > > > > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t > > > hop[4], > > > > > + uint32_t defv) > > > > > +{ > > > > > + uint32_t nh; > > > > > + int i, ret; > > > > > + > > > > > + for (i = 0; i < 4; i++) { > > > > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); > > > > > + hop[i] = (ret == 0) ? nh : defv; > > > > > + } > > > > > +} > > > > > > > > For performance, manually unroll the loop. > > > > > > Given a constant 4x iterations, will compilers not unroll this > > > automatically. I think the loop is a little clearer if it can be kept > > > > > > /Bruce > > > > If in doubt, add this and look at the assembler output: > > > > #define REVIEW_INLINE_FUNCTIONS 1 > > > > #if REVIEW_INLINE_FUNCTIONS /* For compiler output review purposes only. */ > > #pragma GCC diagnostic push > > #pragma GCC diagnostic ignored "-Wmissing-prototypes" > > void review_rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv) > > { > > rte_lpm_lookupx4(lpm, ip, hop, defv); > > } > > #pragma GCC diagnostic pop > > #endif /* REVIEW_INLINE_FUNCTIONS */ > > > > Used godbolt.org to check and indeed the function is not unrolled. > (Gcc 11.2, with flags "-O3 -march=icelake-server"). > > Manually unrolling changes the assembly generated in interesting ways. For > example, it appears to generate more cmov-type instructions for the > miss/default-value case rather than using branches as in the looped > version. Whether this is better or not may depend upon usecase - if one > expects most lpm lookup entries to hit, then having (predictable) branches > may well be cheaper. > > In any case, I'll withdraw any object to unrolling, but I'm still not > convinced it's necessary. > > /Bruce Interestingly enough until I've defined unlikely() in godbolt, I did not get any automatic unrolling on godbolt (either with x86 or RISC-V GCC). Did you get any compilation warnings? That said it only happens on O3 since it implies -fpeel-loops. O3 is the default for DPDK. ^ permalink raw reply [flat|nested] 19+ messages in thread
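For anyone wanting to repeat the experiment, here is a self-contained snippet that can be pasted into a compiler explorer. The rte_* type and the lookup itself are stubbed out (only the shape of the loop matters), and likely()/unlikely() are defined only because the real rte_lpm_lookup() body uses them if pasted in as well:

#include <stdint.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

struct rte_lpm;			/* opaque stub for the real table */

/* stub with the post-v2 signature of rte_lpm_lookup() */
extern int rte_lpm_lookup(const struct rte_lpm *lpm, uint32_t ip,
		uint32_t *next_hop);

void
lookupx4_loop(const struct rte_lpm *lpm, const uint32_t ip[4],
		uint32_t hop[4], uint32_t defv)
{
	uint32_t nh;
	int i, ret;

	/* build with -O3 (which enables -fpeel-loops) and compare against
	 * -O2 to see whether the constant 4-iteration loop gets unrolled */
	for (i = 0; i < 4; i++) {
		ret = rte_lpm_lookup(lpm, ip[i], &nh);
		hop[i] = (ret == 0) ? nh : defv;
	}
}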
* Re: [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function 2022-05-30 11:20 ` Stanisław Kardach @ 2022-05-30 12:46 ` Bruce Richardson 0 siblings, 0 replies; 19+ messages in thread From: Bruce Richardson @ 2022-05-30 12:46 UTC (permalink / raw) To: Stanisław Kardach Cc: Morten Brørup, Stephen Hemminger, Vladimir Medvedkin, Michal Mazurek, dev, Frank Zhao, Sam Grove, Marcin Wojtas, upstream On Mon, May 30, 2022 at 01:20:50PM +0200, Stanisław Kardach wrote: > On Mon, May 30, 2022 at 12:42 PM Bruce Richardson > <bruce.richardson@intel.com> wrote: > > > > On Mon, May 30, 2022 at 10:00:34AM +0200, Morten Brørup wrote: > > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > > > > Sent: Monday, 30 May 2022 09.52 > > > > > > > > On Fri, May 27, 2022 at 01:15:20PM -0700, Stephen Hemminger wrote: > > > > > On Fri, 27 May 2022 20:18:22 +0200 > > > > > Stanislaw Kardach <kda@semihalf.com> wrote: > > > > > > > > > > > +static inline void > > > > > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t > > > > hop[4], > > > > > > + uint32_t defv) > > > > > > +{ > > > > > > + uint32_t nh; > > > > > > + int i, ret; > > > > > > + > > > > > > + for (i = 0; i < 4; i++) { > > > > > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[i], &nh); > > > > > > + hop[i] = (ret == 0) ? nh : defv; > > > > > > + } > > > > > > +} > > > > > > > > > > For performance, manually unroll the loop. > > > > > > > > Given a constant 4x iterations, will compilers not unroll this > > > > automatically. I think the loop is a little clearer if it can be kept > > > > > > > > /Bruce > > > > > > If in doubt, add this and look at the assembler output: > > > > > > #define REVIEW_INLINE_FUNCTIONS 1 > > > > > > #if REVIEW_INLINE_FUNCTIONS /* For compiler output review purposes only. */ > > > #pragma GCC diagnostic push > > > #pragma GCC diagnostic ignored "-Wmissing-prototypes" > > > void review_rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv) > > > { > > > rte_lpm_lookupx4(lpm, ip, hop, defv); > > > } > > > #pragma GCC diagnostic pop > > > #endif /* REVIEW_INLINE_FUNCTIONS */ > > > > > > > Used godbolt.org to check and indeed the function is not unrolled. > > (Gcc 11.2, with flags "-O3 -march=icelake-server"). > > > > Manually unrolling changes the assembly generated in interesting ways. For > > example, it appears to generate more cmov-type instructions for the > > miss/default-value case rather than using branches as in the looped > > version. Whether this is better or not may depend upon usecase - if one > > expects most lpm lookup entries to hit, then having (predictable) branches > > may well be cheaper. > > > > In any case, I'll withdraw any object to unrolling, but I'm still not > > convinced it's necessary. > > > > /Bruce > Interestingly enough until I've defined unlikely() in godbolt, I did > not get any automatic unrolling on godbolt (either with x86 or RISC-V > GCC). Did you get any compilation warnings? That matches what I saw. I then just used manual unrolling i.e. copy-paste the 2 lines 4 times, to see what the output was like then. > That said it only happens on O3 since it implies -fpeel-loops. O3 is > the default for DPDK. ^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup 2022-05-27 18:18 ` [PATCH v2 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 2022-05-27 18:18 ` [PATCH v2 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach @ 2022-05-30 18:24 ` Stanislaw Kardach 2022-05-30 18:24 ` [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach ` (2 more replies) 1 sibling, 3 replies; 19+ messages in thread From: Stanislaw Kardach @ 2022-05-30 18:24 UTC (permalink / raw) To: Vladimir Medvedkin Cc: Stanislaw Kardach, dev, Frank Zhao, Sam Grove, mw, upstream All other rte_lpm_lookup* functions take lpm argument as a const. As the basic rte_lpm_lookup() performs the same function, it should also do that. As this function is inline, no API/ABI change happens. Signed-off-by: Stanislaw Kardach <kda@semihalf.com> --- lib/lpm/rte_lpm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h index eb91960e81..1cf863a146 100644 --- a/lib/lpm/rte_lpm.h +++ b/lib/lpm/rte_lpm.h @@ -279,7 +279,7 @@ rte_lpm_delete_all(struct rte_lpm *lpm); * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit */ static inline int -rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) +rte_lpm_lookup(const struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) { unsigned tbl24_index = (ip >> 8); uint32_t tbl_entry; -- 2.30.2 ^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function 2022-05-30 18:24 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach @ 2022-05-30 18:24 ` Stanislaw Kardach 2022-06-01 9:41 ` Medvedkin, Vladimir 2022-05-30 20:38 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stephen Hemminger 2022-06-01 9:35 ` Medvedkin, Vladimir 2 siblings, 1 reply; 19+ messages in thread From: Stanislaw Kardach @ 2022-05-30 18:24 UTC (permalink / raw) To: Vladimir Medvedkin Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream, Stanislaw Kardach From: Michal Mazurek <maz@semihalf.com> Add an implementation of the rte_lpm_lookupx4() function for platforms without support for vector operations. This will be useful in the upcoming RISC-V port as well as any platform which may want to start with a basic level of LPM support. Signed-off-by: Michal Mazurek <maz@semihalf.com> Signed-off-by: Stanislaw Kardach <kda@semihalf.com> --- doc/guides/rel_notes/release_22_07.rst | 5 ++++ lib/lpm/meson.build | 1 + lib/lpm/rte_lpm.h | 4 ++- lib/lpm/rte_lpm_scalar.h | 40 ++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 lib/lpm/rte_lpm_scalar.h diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst index e49cacecef..0cf3f71269 100644 --- a/doc/guides/rel_notes/release_22_07.rst +++ b/doc/guides/rel_notes/release_22_07.rst @@ -104,6 +104,11 @@ New Features * ``RTE_EVENT_QUEUE_ATTR_WEIGHT`` * ``RTE_EVENT_QUEUE_ATTR_AFFINITY`` +* **Added scalar version of the LPM library.** + + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back + implementation for platforms that don't support vector operations. + Removed Items ------------- diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build index 78d91d3421..6b47361fce 100644 --- a/lib/lpm/meson.build +++ b/lib/lpm/meson.build @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') indirect_headers += files( 'rte_lpm_altivec.h', 'rte_lpm_neon.h', + 'rte_lpm_scalar.h', 'rte_lpm_sse.h', 'rte_lpm_sve.h', ) diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h index 1cf863a146..4f38864fde 100644 --- a/lib/lpm/rte_lpm.h +++ b/lib/lpm/rte_lpm.h @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], #endif #elif defined(RTE_ARCH_PPC_64) #include "rte_lpm_altivec.h" -#else +#elif defined(RTE_ARCH_X86) #include "rte_lpm_sse.h" +#else +#include "rte_lpm_scalar.h" #endif #ifdef __cplusplus diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h new file mode 100644 index 0000000000..4ae1b6f0b8 --- /dev/null +++ b/lib/lpm/rte_lpm_scalar.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2022 StarFive + * Copyright(c) 2022 SiFive + * Copyright(c) 2022 Semihalf + */ + +#ifndef _RTE_LPM_SCALAR_H_ +#define _RTE_LPM_SCALAR_H_ + +#include <rte_branch_prediction.h> +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_vect.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static inline void +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv) +{ + uint32_t nh; + int ret; + + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[0], &nh); + hop[0] = (ret == 0) ? nh : defv; + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[1], &nh); + hop[1] = (ret == 0) ? nh : defv; + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[2], &nh); + hop[2] = (ret == 0) ? nh : defv; + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[3], &nh); + hop[3] = (ret == 0) ? 
nh : defv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_LPM_SCALAR_H_ */ -- 2.30.2 ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function 2022-05-30 18:24 ` [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach @ 2022-06-01 9:41 ` Medvedkin, Vladimir 2022-06-01 10:32 ` Stanisław Kardach 0 siblings, 1 reply; 19+ messages in thread From: Medvedkin, Vladimir @ 2022-06-01 9:41 UTC (permalink / raw) To: Stanislaw Kardach Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, mw, upstream Hi Stanislaw, On 30/05/2022 19:24, Stanislaw Kardach wrote: > From: Michal Mazurek <maz@semihalf.com> > > Add an implementation of the rte_lpm_lookupx4() function for platforms > without support for vector operations. > > This will be useful in the upcoming RISC-V port as well as any platform > which may want to start with a basic level of LPM support. > > Signed-off-by: Michal Mazurek <maz@semihalf.com> > Signed-off-by: Stanislaw Kardach <kda@semihalf.com> > --- > doc/guides/rel_notes/release_22_07.rst | 5 ++++ > lib/lpm/meson.build | 1 + > lib/lpm/rte_lpm.h | 4 ++- > lib/lpm/rte_lpm_scalar.h | 40 ++++++++++++++++++++++++++ > 4 files changed, 49 insertions(+), 1 deletion(-) > create mode 100644 lib/lpm/rte_lpm_scalar.h > > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst > index e49cacecef..0cf3f71269 100644 > --- a/doc/guides/rel_notes/release_22_07.rst > +++ b/doc/guides/rel_notes/release_22_07.rst > @@ -104,6 +104,11 @@ New Features > * ``RTE_EVENT_QUEUE_ATTR_WEIGHT`` > * ``RTE_EVENT_QUEUE_ATTR_AFFINITY`` > > +* **Added scalar version of the LPM library.** > + > + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back > + implementation for platforms that don't support vector operations. > + > > Removed Items > ------------- > diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build > index 78d91d3421..6b47361fce 100644 > --- a/lib/lpm/meson.build > +++ b/lib/lpm/meson.build > @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') > indirect_headers += files( > 'rte_lpm_altivec.h', > 'rte_lpm_neon.h', > + 'rte_lpm_scalar.h', > 'rte_lpm_sse.h', > 'rte_lpm_sve.h', > ) > diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h > index 1cf863a146..4f38864fde 100644 > --- a/lib/lpm/rte_lpm.h > +++ b/lib/lpm/rte_lpm.h > @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > #endif > #elif defined(RTE_ARCH_PPC_64) > #include "rte_lpm_altivec.h" > -#else > +#elif defined(RTE_ARCH_X86) > #include "rte_lpm_sse.h" > +#else > +#include "rte_lpm_scalar.h" > #endif > > #ifdef __cplusplus > diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h > new file mode 100644 > index 0000000000..4ae1b6f0b8 > --- /dev/null > +++ b/lib/lpm/rte_lpm_scalar.h > @@ -0,0 +1,40 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2022 StarFive > + * Copyright(c) 2022 SiFive > + * Copyright(c) 2022 Semihalf > + */ > + > +#ifndef _RTE_LPM_SCALAR_H_ > +#define _RTE_LPM_SCALAR_H_ > + > +#include <rte_branch_prediction.h> > +#include <rte_byteorder.h> > +#include <rte_common.h> Just a one nit, I think these 3 headers are not needed and can be removed. Apart from it looks good to me. Thanks! > +#include <rte_vect.h> > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +static inline void > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > + uint32_t defv) > +{ > + uint32_t nh; > + int ret; > + > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[0], &nh); > + hop[0] = (ret == 0) ? 
nh : defv; > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[1], &nh); > + hop[1] = (ret == 0) ? nh : defv; > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[2], &nh); > + hop[2] = (ret == 0) ? nh : defv; > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[3], &nh); > + hop[3] = (ret == 0) ? nh : defv; > +} > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif /* _RTE_LPM_SCALAR_H_ */ Acked-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com> -- Regards, Vladimir ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function 2022-06-01 9:41 ` Medvedkin, Vladimir @ 2022-06-01 10:32 ` Stanisław Kardach 0 siblings, 0 replies; 19+ messages in thread From: Stanisław Kardach @ 2022-06-01 10:32 UTC (permalink / raw) To: Medvedkin, Vladimir Cc: Michal Mazurek, dev, Frank Zhao, Sam Grove, Marcin Wojtas, upstream Hi Vladimir, On Wed, Jun 1, 2022 at 11:41 AM Medvedkin, Vladimir <vladimir.medvedkin@intel.com> wrote: > > Hi Stanislaw, > > > On 30/05/2022 19:24, Stanislaw Kardach wrote: > > From: Michal Mazurek <maz@semihalf.com> > > > > Add an implementation of the rte_lpm_lookupx4() function for platforms > > without support for vector operations. > > > > This will be useful in the upcoming RISC-V port as well as any platform > > which may want to start with a basic level of LPM support. > > > > Signed-off-by: Michal Mazurek <maz@semihalf.com> > > Signed-off-by: Stanislaw Kardach <kda@semihalf.com> > > --- > > doc/guides/rel_notes/release_22_07.rst | 5 ++++ > > lib/lpm/meson.build | 1 + > > lib/lpm/rte_lpm.h | 4 ++- > > lib/lpm/rte_lpm_scalar.h | 40 ++++++++++++++++++++++++++ > > 4 files changed, 49 insertions(+), 1 deletion(-) > > create mode 100644 lib/lpm/rte_lpm_scalar.h > > > > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst > > index e49cacecef..0cf3f71269 100644 > > --- a/doc/guides/rel_notes/release_22_07.rst > > +++ b/doc/guides/rel_notes/release_22_07.rst > > @@ -104,6 +104,11 @@ New Features > > * ``RTE_EVENT_QUEUE_ATTR_WEIGHT`` > > * ``RTE_EVENT_QUEUE_ATTR_AFFINITY`` > > > > +* **Added scalar version of the LPM library.** > > + > > + * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back > > + implementation for platforms that don't support vector operations. > > + > > > > Removed Items > > ------------- > > diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build > > index 78d91d3421..6b47361fce 100644 > > --- a/lib/lpm/meson.build > > +++ b/lib/lpm/meson.build > > @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') > > indirect_headers += files( > > 'rte_lpm_altivec.h', > > 'rte_lpm_neon.h', > > + 'rte_lpm_scalar.h', > > 'rte_lpm_sse.h', > > 'rte_lpm_sve.h', > > ) > > diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h > > index 1cf863a146..4f38864fde 100644 > > --- a/lib/lpm/rte_lpm.h > > +++ b/lib/lpm/rte_lpm.h > > @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > > #endif > > #elif defined(RTE_ARCH_PPC_64) > > #include "rte_lpm_altivec.h" > > -#else > > +#elif defined(RTE_ARCH_X86) > > #include "rte_lpm_sse.h" > > +#else > > +#include "rte_lpm_scalar.h" > > #endif > > > > #ifdef __cplusplus > > diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h > > new file mode 100644 > > index 0000000000..4ae1b6f0b8 > > --- /dev/null > > +++ b/lib/lpm/rte_lpm_scalar.h > > @@ -0,0 +1,40 @@ > > +/* SPDX-License-Identifier: BSD-3-Clause > > + * Copyright(c) 2022 StarFive > > + * Copyright(c) 2022 SiFive > > + * Copyright(c) 2022 Semihalf > > + */ > > + > > +#ifndef _RTE_LPM_SCALAR_H_ > > +#define _RTE_LPM_SCALAR_H_ > > + > > +#include <rte_branch_prediction.h> > > +#include <rte_byteorder.h> > > +#include <rte_common.h> > > Just a one nit, I think these 3 headers are not needed and can be > removed. Apart from it looks good to me. > Thanks! Thanks for catching this. I'll send a followup right away. 
> > > +#include <rte_vect.h> > > + > > +#ifdef __cplusplus > > +extern "C" { > > +#endif > > + > > +static inline void > > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], > > + uint32_t defv) > > +{ > > + uint32_t nh; > > + int ret; > > + > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[0], &nh); > > + hop[0] = (ret == 0) ? nh : defv; > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[1], &nh); > > + hop[1] = (ret == 0) ? nh : defv; > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[2], &nh); > > + hop[2] = (ret == 0) ? nh : defv; > > + ret = rte_lpm_lookup(lpm, ((rte_xmm_t)ip).u32[3], &nh); > > + hop[3] = (ret == 0) ? nh : defv; > > +} > > + > > +#ifdef __cplusplus > > +} > > +#endif > > + > > +#endif /* _RTE_LPM_SCALAR_H_ */ > > Acked-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com> > > -- > Regards, > Vladimir ^ permalink raw reply [flat|nested] 19+ messages in thread
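(For reference, the promised follow-up presumably just trims the scalar header's include list down to the one header it actually needs, i.e. something like:

	#include <rte_vect.h>	/* xmm_t / rte_xmm_t used by the scalar lookupx4 */

with rte_branch_prediction.h, rte_byteorder.h and rte_common.h dropped.)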
* Re: [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup 2022-05-30 18:24 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 2022-05-30 18:24 ` [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach @ 2022-05-30 20:38 ` Stephen Hemminger 2022-06-01 9:35 ` Medvedkin, Vladimir 2 siblings, 0 replies; 19+ messages in thread From: Stephen Hemminger @ 2022-05-30 20:38 UTC (permalink / raw) To: Stanislaw Kardach Cc: Vladimir Medvedkin, dev, Frank Zhao, Sam Grove, mw, upstream On Mon, 30 May 2022 20:24:36 +0200 Stanislaw Kardach <kda@semihalf.com> wrote: > All other rte_lpm_lookup* functions take lpm argument as a const. As the > basic rte_lpm_lookup() performs the same function, it should also do > that. > > As this function is inline, no API/ABI change happens. > > Signed-off-by: Stanislaw Kardach <kda@semihalf.com> Acked-by: Stephen Hemminger <stephen@networkplumber.org> ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup 2022-05-30 18:24 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stanislaw Kardach 2022-05-30 18:24 ` [PATCH v3 2/2] lpm: add a scalar version of lookupx4 function Stanislaw Kardach 2022-05-30 20:38 ` [PATCH v3 1/2] lpm: add const to lpm arg of rte_lpm_lookup Stephen Hemminger @ 2022-06-01 9:35 ` Medvedkin, Vladimir 2 siblings, 0 replies; 19+ messages in thread From: Medvedkin, Vladimir @ 2022-06-01 9:35 UTC (permalink / raw) To: Stanislaw Kardach; +Cc: dev, Frank Zhao, Sam Grove, mw, upstream On 30/05/2022 19:24, Stanislaw Kardach wrote: > All other rte_lpm_lookup* functions take lpm argument as a const. As the > basic rte_lpm_lookup() performs the same function, it should also do > that. > > As this function is inline, no API/ABI change happens. > > Signed-off-by: Stanislaw Kardach <kda@semihalf.com> > --- > lib/lpm/rte_lpm.h | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h > index eb91960e81..1cf863a146 100644 > --- a/lib/lpm/rte_lpm.h > +++ b/lib/lpm/rte_lpm.h > @@ -279,7 +279,7 @@ rte_lpm_delete_all(struct rte_lpm *lpm); > * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit > */ > static inline int > -rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) > +rte_lpm_lookup(const struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop) > { > unsigned tbl24_index = (ip >> 8); > uint32_t tbl_entry; Acked-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com> -- Regards, Vladimir ^ permalink raw reply [flat|nested] 19+ messages in thread