From: Pavan Nikhilesh <pbhagavatula@marvell.com> Add lookup x4 with x4 default values. This can be used in usecases where we have to process burst of packets from different ports. Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com> --- app/test/test_lpm_perf.c | 31 +++++++++ lib/librte_lpm/rte_lpm.h | 23 +++++++ lib/librte_lpm/rte_lpm_altivec.h | 109 +++++++++++++++++++++++++++++++ lib/librte_lpm/rte_lpm_neon.h | 102 +++++++++++++++++++++++++++++ lib/librte_lpm/rte_lpm_sse.h | 104 +++++++++++++++++++++++++++++ 5 files changed, 369 insertions(+) diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c index a2578fe90..8e9d4c7eb 100644 --- a/app/test/test_lpm_perf.c +++ b/app/test/test_lpm_perf.c @@ -460,6 +460,37 @@ test_lpm_perf(void) (double)total_time / ((double)ITERATIONS * BATCH_SIZE), (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE)); + /* Measure LookupX4 DefaultX4 */ + total_time = 0; + count = 0; + uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX}; + for (i = 0; i < ITERATIONS; i++) { + static uint32_t ip_batch[BATCH_SIZE]; + uint32_t next_hops[4]; + + /* Create array of random IP addresses */ + for (j = 0; j < BATCH_SIZE; j++) + ip_batch[j] = rte_rand(); + + /* Lookup per batch */ + begin = rte_rdtsc(); + for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) { + unsigned int k; + xmm_t ipx4; + + ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j)); + ipx4 = *(xmm_t *)(ip_batch + j); + rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops, def); + for (k = 0; k < RTE_DIM(next_hops); k++) + if (unlikely(next_hops[k] == UINT32_MAX)) + count++; + } + + total_time += rte_rdtsc() - begin; + } + printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n", + (double)total_time / ((double)ITERATIONS * BATCH_SIZE), + (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE)); /* Measure Delete */ status = 0; begin = rte_rdtsc(); diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index b9d49ac87..e66b43e06 100644 --- 
a/lib/librte_lpm/rte_lpm.h +++ b/lib/librte_lpm/rte_lpm.h @@ -370,6 +370,29 @@ static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], uint32_t defv); +/** + * Lookup four IP addresses in an LPM table. + * + * @param lpm + * LPM object handle + * @param ip + * Four IPs to be looked up in the LPM table + * @param hop + * Next hop of the most specific rule found for IP (valid on lookup hit only). + * This is an 4 elements array of two byte values. + * If the lookup was successful for the given IP, then least significant byte + * of the corresponding element is the actual next hop and the most + * significant byte is zero. + * If the lookup for the given IP failed, then corresponding element would + * contain default value, see description of then next parameter. + * @param defv + * Default value[] to populate into corresponding element of hop[] array, + * if lookup would fail. + */ +static inline void +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv[4]); + #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) #include "rte_lpm_neon.h" #elif defined(RTE_ARCH_PPC_64) diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h index 228c41b38..1afc7bd74 100644 --- a/lib/librte_lpm/rte_lpm_altivec.h +++ b/lib/librte_lpm/rte_lpm_altivec.h @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv; } +static inline void +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv[4]) +{ + vector signed int i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t idx, pt, pt2; + const uint32_t *ptbl; + + const uint32_t mask = UINT8_MAX; + const vector signed int mask8 = (xmm_t){mask, mask, mask, mask}; + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). 
+ */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24 = vec_sr((xmm_t) ip, + (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT}); + + /* extract values from tbl24[] */ + idx = (uint32_t)i24[0]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[0] = *ptbl; + + idx = (uint32_t) i24[1]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[1] = *ptbl; + + idx = (uint32_t) i24[2]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[2] = *ptbl; + + idx = (uint32_t) i24[3]; + idx = idx < (1<<24) ? idx : (1<<24)-1; + ptbl = (const uint32_t *)&lpm->tbl24[idx]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = vec_and(ip, mask8); + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. 
*/ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : + defv[0]; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : + defv[1]; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : + defv[2]; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : + defv[3]; +} + #ifdef __cplusplus } #endif diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h index 6c131d312..6ef635b18 100644 --- a/lib/librte_lpm/rte_lpm_neon.h +++ b/lib/librte_lpm/rte_lpm_neon.h @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & 0x00FFFFFF : defv; } +static inline void +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv[4]) +{ + uint32x4_t i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t idx, pt, pt2; + const uint32_t *ptbl; + + const uint32_t mask = UINT8_MAX; + const int32x4_t mask8 = vdupq_n_s32(mask); + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). + */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT); + + /* extract values from tbl24[] */ + idx = vgetq_lane_u64((uint64x2_t)i24, 0); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[0] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[1] = *ptbl; + + idx = vgetq_lane_u64((uint64x2_t)i24, 1); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[2] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = vandq_s32(ip, mask8); + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. 
*/ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : + defv[0]; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : + defv[1]; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : + defv[2]; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : + defv[3]; +} + #ifdef __cplusplus } #endif diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h index 44770b6ff..6ef15816c 100644 --- a/lib/librte_lpm/rte_lpm_sse.h +++ b/lib/librte_lpm/rte_lpm_sse.h @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & 0x00FFFFFF : defv; } +static inline void +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4], + uint32_t defv[4]) +{ + __m128i i24; + rte_xmm_t i8; + uint32_t tbl[4]; + uint64_t idx, pt, pt2; + const uint32_t *ptbl; + + const __m128i mask8 = + _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX); + + /* + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries + * as one 64-bit value (0x0300000003000000). + */ + const uint64_t mask_xv = + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32); + + /* + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries + * as one 64-bit value (0x0100000001000000). + */ + const uint64_t mask_v = + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); + + /* get 4 indexes for tbl24[]. */ + i24 = _mm_srli_epi32(ip, CHAR_BIT); + + /* extract values from tbl24[] */ + idx = _mm_cvtsi128_si64(i24); + /* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a constant */ + i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[0] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[1] = *ptbl; + + idx = _mm_cvtsi128_si64(i24); + + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; + tbl[2] = *ptbl; + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; + tbl[3] = *ptbl; + + /* get 4 indexes for tbl8[]. */ + i8.x = _mm_and_si128(ip, mask8); + + pt = (uint64_t)tbl[0] | + (uint64_t)tbl[1] << 32; + pt2 = (uint64_t)tbl[2] | + (uint64_t)tbl[3] << 32; + + /* search successfully finished for all 4 IP addresses. 
*/ + if (likely((pt & mask_xv) == mask_v) && + likely((pt2 & mask_xv) == mask_v)) { + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; + return; + } + + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[0] = i8.u32[0] + + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; + tbl[0] = *ptbl; + } + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[1] = i8.u32[1] + + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; + tbl[1] = *ptbl; + } + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[2] = i8.u32[2] + + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; + tbl[2] = *ptbl; + } + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + i8.u32[3] = i8.u32[3] + + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; + tbl[3] = *ptbl; + } + + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : + defv[0]; + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : + defv[1]; + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : + defv[2]; + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : + defv[3]; +} + #ifdef __cplusplus } #endif -- 2.17.1
Hi Pavan,
I don't think it is a good idea to add an extra function because:
1) It is just a copy of the existing rte_lpm_lookupx4() except for the last 4
ternary ops.
2) What is the real-world use case for that? Usually the returned value is
used as an index into an array of next_hop structs.
3) You can get the same result by using a special unused defv and
pcmpeqd/vpblendd on hop[4] after the lookup.
On 11/01/2020 16:08, pbhagavatula@marvell.com wrote:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add lookup x4 with x4 default values.
> This can be used in usecases where we have to process burst of packets
> from different ports.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
> app/test/test_lpm_perf.c | 31 +++++++++
> lib/librte_lpm/rte_lpm.h | 23 +++++++
> lib/librte_lpm/rte_lpm_altivec.h | 109 +++++++++++++++++++++++++++++++
> lib/librte_lpm/rte_lpm_neon.h | 102 +++++++++++++++++++++++++++++
> lib/librte_lpm/rte_lpm_sse.h | 104 +++++++++++++++++++++++++++++
> 5 files changed, 369 insertions(+)
>
> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
> index a2578fe90..8e9d4c7eb 100644
> --- a/app/test/test_lpm_perf.c
> +++ b/app/test/test_lpm_perf.c
> @@ -460,6 +460,37 @@ test_lpm_perf(void)
> (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
> (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
>
> + /* Measure LookupX4 DefaultX4 */
> + total_time = 0;
> + count = 0;
> + uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
> + for (i = 0; i < ITERATIONS; i++) {
> + static uint32_t ip_batch[BATCH_SIZE];
> + uint32_t next_hops[4];
> +
> + /* Create array of random IP addresses */
> + for (j = 0; j < BATCH_SIZE; j++)
> + ip_batch[j] = rte_rand();
> +
> + /* Lookup per batch */
> + begin = rte_rdtsc();
> + for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
> + unsigned int k;
> + xmm_t ipx4;
> +
> + ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
> + ipx4 = *(xmm_t *)(ip_batch + j);
> + rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops, def);
> + for (k = 0; k < RTE_DIM(next_hops); k++)
> + if (unlikely(next_hops[k] == UINT32_MAX))
> + count++;
> + }
> +
> + total_time += rte_rdtsc() - begin;
> + }
> + printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n",
> + (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
> + (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
> /* Measure Delete */
> status = 0;
> begin = rte_rdtsc();
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index b9d49ac87..e66b43e06 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -370,6 +370,29 @@ static inline void
> rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> uint32_t defv);
>
> +/**
> + * Lookup four IP addresses in an LPM table.
> + *
> + * @param lpm
> + * LPM object handle
> + * @param ip
> + * Four IPs to be looked up in the LPM table
> + * @param hop
> + * Next hop of the most specific rule found for IP (valid on lookup hit only).
> + * This is a 4-element array of two byte values.
> + * If the lookup was successful for the given IP, then least significant byte
> + * of the corresponding element is the actual next hop and the most
> + * significant byte is zero.
> + * If the lookup for the given IP failed, then corresponding element would
> + * contain default value, see description of the next parameter.
> + * @param defv
> + * Default value[] to populate into corresponding element of hop[] array,
> + * if lookup would fail.
> + */
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> + uint32_t defv[4]);
> +
> #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> #include "rte_lpm_neon.h"
> #elif defined(RTE_ARCH_PPC_64)
> diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
> index 228c41b38..1afc7bd74 100644
> --- a/lib/librte_lpm/rte_lpm_altivec.h
> +++ b/lib/librte_lpm/rte_lpm_altivec.h
> @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
> }
>
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> + uint32_t defv[4])
> +{
> + vector signed int i24;
> + rte_xmm_t i8;
> + uint32_t tbl[4];
> + uint64_t idx, pt, pt2;
> + const uint32_t *ptbl;
> +
> + const uint32_t mask = UINT8_MAX;
> + const vector signed int mask8 = (xmm_t){mask, mask, mask, mask};
> +
> + /*
> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> + * as one 64-bit value (0x0300000003000000).
> + */
> + const uint64_t mask_xv =
> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> +
> + /*
> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> + * as one 64-bit value (0x0100000001000000).
> + */
> + const uint64_t mask_v =
> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> +
> + /* get 4 indexes for tbl24[]. */
> + i24 = vec_sr((xmm_t) ip,
> + (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT});
> +
> + /* extract values from tbl24[] */
> + idx = (uint32_t)i24[0];
> + idx = idx < (1<<24) ? idx : (1<<24)-1;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx];
> + tbl[0] = *ptbl;
> +
> + idx = (uint32_t) i24[1];
> + idx = idx < (1<<24) ? idx : (1<<24)-1;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx];
> + tbl[1] = *ptbl;
> +
> + idx = (uint32_t) i24[2];
> + idx = idx < (1<<24) ? idx : (1<<24)-1;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx];
> + tbl[2] = *ptbl;
> +
> + idx = (uint32_t) i24[3];
> + idx = idx < (1<<24) ? idx : (1<<24)-1;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx];
> + tbl[3] = *ptbl;
> +
> + /* get 4 indexes for tbl8[]. */
> + i8.x = vec_and(ip, mask8);
> +
> + pt = (uint64_t)tbl[0] |
> + (uint64_t)tbl[1] << 32;
> + pt2 = (uint64_t)tbl[2] |
> + (uint64_t)tbl[3] << 32;
> +
> + /* search successfully finished for all 4 IP addresses. */
> + if (likely((pt & mask_xv) == mask_v) &&
> + likely((pt2 & mask_xv) == mask_v)) {
> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> + return;
> + }
> +
> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[0] = i8.u32[0] +
> + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> + tbl[0] = *ptbl;
> + }
> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[1] = i8.u32[1] +
> + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> + tbl[1] = *ptbl;
> + }
> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[2] = i8.u32[2] +
> + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> + tbl[2] = *ptbl;
> + }
> + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[3] = i8.u32[3] +
> + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> + tbl[3] = *ptbl;
> + }
> +
> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
> + defv[0];
> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
> + defv[1];
> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
> + defv[2];
> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
> + defv[3];
> +}
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
> index 6c131d312..6ef635b18 100644
> --- a/lib/librte_lpm/rte_lpm_neon.h
> +++ b/lib/librte_lpm/rte_lpm_neon.h
> @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
> }
>
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> + uint32_t defv[4])
> +{
> + uint32x4_t i24;
> + rte_xmm_t i8;
> + uint32_t tbl[4];
> + uint64_t idx, pt, pt2;
> + const uint32_t *ptbl;
> +
> + const uint32_t mask = UINT8_MAX;
> + const int32x4_t mask8 = vdupq_n_s32(mask);
> +
> + /*
> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> + * as one 64-bit value (0x0300000003000000).
> + */
> + const uint64_t mask_xv =
> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> +
> + /*
> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> + * as one 64-bit value (0x0100000001000000).
> + */
> + const uint64_t mask_v =
> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> +
> + /* get 4 indexes for tbl24[]. */
> + i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT);
> +
> + /* extract values from tbl24[] */
> + idx = vgetq_lane_u64((uint64x2_t)i24, 0);
> +
> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> + tbl[0] = *ptbl;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> + tbl[1] = *ptbl;
> +
> + idx = vgetq_lane_u64((uint64x2_t)i24, 1);
> +
> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> + tbl[2] = *ptbl;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> + tbl[3] = *ptbl;
> +
> + /* get 4 indexes for tbl8[]. */
> + i8.x = vandq_s32(ip, mask8);
> +
> + pt = (uint64_t)tbl[0] |
> + (uint64_t)tbl[1] << 32;
> + pt2 = (uint64_t)tbl[2] |
> + (uint64_t)tbl[3] << 32;
> +
> + /* search successfully finished for all 4 IP addresses. */
> + if (likely((pt & mask_xv) == mask_v) &&
> + likely((pt2 & mask_xv) == mask_v)) {
> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> + return;
> + }
> +
> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[0] = i8.u32[0] +
> + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> + tbl[0] = *ptbl;
> + }
> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[1] = i8.u32[1] +
> + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> + tbl[1] = *ptbl;
> + }
> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[2] = i8.u32[2] +
> + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> + tbl[2] = *ptbl;
> + }
> + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[3] = i8.u32[3] +
> + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> + tbl[3] = *ptbl;
> + }
> +
> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
> + defv[0];
> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
> + defv[1];
> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
> + defv[2];
> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
> + defv[3];
> +}
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> index 44770b6ff..6ef15816c 100644
> --- a/lib/librte_lpm/rte_lpm_sse.h
> +++ b/lib/librte_lpm/rte_lpm_sse.h
> @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
> }
>
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> + uint32_t defv[4])
> +{
> + __m128i i24;
> + rte_xmm_t i8;
> + uint32_t tbl[4];
> + uint64_t idx, pt, pt2;
> + const uint32_t *ptbl;
> +
> + const __m128i mask8 =
> + _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
> +
> + /*
> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> + * as one 64-bit value (0x0300000003000000).
> + */
> + const uint64_t mask_xv =
> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> +
> + /*
> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> + * as one 64-bit value (0x0100000001000000).
> + */
> + const uint64_t mask_v =
> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> +
> + /* get 4 indexes for tbl24[]. */
> + i24 = _mm_srli_epi32(ip, CHAR_BIT);
> +
> + /* extract values from tbl24[] */
> + idx = _mm_cvtsi128_si64(i24);
> + /* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a constant */
> + i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8);
> +
> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> + tbl[0] = *ptbl;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> + tbl[1] = *ptbl;
> +
> + idx = _mm_cvtsi128_si64(i24);
> +
> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> + tbl[2] = *ptbl;
> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> + tbl[3] = *ptbl;
> +
> + /* get 4 indexes for tbl8[]. */
> + i8.x = _mm_and_si128(ip, mask8);
> +
> + pt = (uint64_t)tbl[0] |
> + (uint64_t)tbl[1] << 32;
> + pt2 = (uint64_t)tbl[2] |
> + (uint64_t)tbl[3] << 32;
> +
> + /* search successfully finished for all 4 IP addresses. */
> + if (likely((pt & mask_xv) == mask_v) &&
> + likely((pt2 & mask_xv) == mask_v)) {
> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> + return;
> + }
> +
> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[0] = i8.u32[0] +
> + (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> + tbl[0] = *ptbl;
> + }
> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[1] = i8.u32[1] +
> + (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> + tbl[1] = *ptbl;
> + }
> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[2] = i8.u32[2] +
> + (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> + tbl[2] = *ptbl;
> + }
> + if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> + i8.u32[3] = i8.u32[3] +
> + (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> + tbl[3] = *ptbl;
> + }
> +
> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
> + defv[0];
> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
> + defv[1];
> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
> + defv[2];
> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
> + defv[3];
> +}
> +
> #ifdef __cplusplus
> }
> #endif
--
Regards,
Vladimir
>-----Original Message----- >From: dev <dev-bounces@dpdk.org> On Behalf Of Medvedkin, >Vladimir >Sent: Monday, January 13, 2020 4:37 PM >To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin >Jacob Kollanukkaran <jerinj@marvell.com>; Bruce Richardson ><bruce.richardson@intel.com>; Gavin Hu <gavin.hu@arm.com> >Cc: dev@dpdk.org >Subject: Re: [dpdk-dev] [PATCH] lmp: add lookup x4 with x4 default >values > >Hi Pavan, > Hi Medvedkin, >I don't think it is a good idea to add extra function because: > >1) it is just a copy of an existing rte_lpm_lookupx4() except the last 4 >ternary ops Yes, but I had no other option as modifying the current function will break ABI ☹. > >2) What is a real world use case for that? Usually returned value is >used as an index in an array of next_hop structs. If we take l3fwd as an example the next hop holds fwd port_id whereas the default value passed holds mbuf->port. This allows Tx without having a branch. Event devices can aggregate packets from multiple ethernet ports and schedule them on a core. The current API requires us to pass a BAD_PORT and compare the result for every packet but if we are allowed to pass 4 different default values we could seamlessly send them for Tx. > >3) You can have the same result by using special unused defv and >pcmpeqd/vpblendd on a hop[4] after lookup Yes, but sadly that would be architecture dependent. > >On 11/01/2020 16:08, pbhagavatula@marvell.com wrote: >> From: Pavan Nikhilesh <pbhagavatula@marvell.com> >> >> Add lookup x4 with x4 default values. >> This can be used in usecases where we have to process burst of >packets >> from different ports. 
>> >> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com> >> --- >> app/test/test_lpm_perf.c | 31 +++++++++ >> lib/librte_lpm/rte_lpm.h | 23 +++++++ >> lib/librte_lpm/rte_lpm_altivec.h | 109 >+++++++++++++++++++++++++++++++ >> lib/librte_lpm/rte_lpm_neon.h | 102 >+++++++++++++++++++++++++++++ >> lib/librte_lpm/rte_lpm_sse.h | 104 >+++++++++++++++++++++++++++++ >> 5 files changed, 369 insertions(+) >> >> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c >> index a2578fe90..8e9d4c7eb 100644 >> --- a/app/test/test_lpm_perf.c >> +++ b/app/test/test_lpm_perf.c >> @@ -460,6 +460,37 @@ test_lpm_perf(void) >> (double)total_time / ((double)ITERATIONS * >BATCH_SIZE), >> (count * 100.0) / (double)(ITERATIONS * >BATCH_SIZE)); >> >> + /* Measure LookupX4 DefaultX4 */ >> + total_time = 0; >> + count = 0; >> + uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, >UINT32_MAX}; >> + for (i = 0; i < ITERATIONS; i++) { >> + static uint32_t ip_batch[BATCH_SIZE]; >> + uint32_t next_hops[4]; >> + >> + /* Create array of random IP addresses */ >> + for (j = 0; j < BATCH_SIZE; j++) >> + ip_batch[j] = rte_rand(); >> + >> + /* Lookup per batch */ >> + begin = rte_rdtsc(); >> + for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) { >> + unsigned int k; >> + xmm_t ipx4; >> + >> + ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + >j)); >> + ipx4 = *(xmm_t *)(ip_batch + j); >> + rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops, >def); >> + for (k = 0; k < RTE_DIM(next_hops); k++) >> + if (unlikely(next_hops[k] == >UINT32_MAX)) >> + count++; >> + } >> + >> + total_time += rte_rdtsc() - begin; >> + } >> + printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n", >> + (double)total_time / ((double)ITERATIONS * >BATCH_SIZE), >> + (count * 100.0) / (double)(ITERATIONS * >BATCH_SIZE)); >> /* Measure Delete */ >> status = 0; >> begin = rte_rdtsc(); >> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h >> index b9d49ac87..e66b43e06 100644 >> --- a/lib/librte_lpm/rte_lpm.h 
>> +++ b/lib/librte_lpm/rte_lpm.h >> @@ -370,6 +370,29 @@ static inline void >> rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t >hop[4], >> uint32_t defv); >> >> +/** >> + * Lookup four IP addresses in an LPM table. >> + * >> + * @param lpm >> + * LPM object handle >> + * @param ip >> + * Four IPs to be looked up in the LPM table >> + * @param hop >> + * Next hop of the most specific rule found for IP (valid on lookup >hit only). >> + * This is an 4 elements array of two byte values. >> + * If the lookup was successful for the given IP, then least significant >byte >> + * of the corresponding element is the actual next hop and the >most >> + * significant byte is zero. >> + * If the lookup for the given IP failed, then corresponding element >would >> + * contain default value, see description of then next parameter. >> + * @param defv >> + * Default value[] to populate into corresponding element of hop[] >array, >> + * if lookup would fail. >> + */ >> +static inline void >> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >uint32_t hop[4], >> + uint32_t defv[4]); >> + >> #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) >> #include "rte_lpm_neon.h" >> #elif defined(RTE_ARCH_PPC_64) >> diff --git a/lib/librte_lpm/rte_lpm_altivec.h >b/lib/librte_lpm/rte_lpm_altivec.h >> index 228c41b38..1afc7bd74 100644 >> --- a/lib/librte_lpm/rte_lpm_altivec.h >> +++ b/lib/librte_lpm/rte_lpm_altivec.h >> @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm >*lpm, xmm_t ip, uint32_t hop[4], >> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & >0x00FFFFFF : defv; >> } >> >> +static inline void >> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >uint32_t hop[4], >> + uint32_t defv[4]) >> +{ >> + vector signed int i24; >> + rte_xmm_t i8; >> + uint32_t tbl[4]; >> + uint64_t idx, pt, pt2; >> + const uint32_t *ptbl; >> + >> + const uint32_t mask = UINT8_MAX; >> + const vector signed int mask8 = (xmm_t){mask, mask, mask, >mask}; >> + >> + /* >> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries >> + * as one 64-bit value (0x0300000003000000). >> + */ >> + const uint64_t mask_xv = >> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | >> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << >32); >> + >> + /* >> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries >> + * as one 64-bit value (0x0100000001000000). >> + */ >> + const uint64_t mask_v = >> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | >> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); >> + >> + /* get 4 indexes for tbl24[]. */ >> + i24 = vec_sr((xmm_t) ip, >> + (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, >CHAR_BIT}); >> + >> + /* extract values from tbl24[] */ >> + idx = (uint32_t)i24[0]; >> + idx = idx < (1<<24) ? idx : (1<<24)-1; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >> + tbl[0] = *ptbl; >> + >> + idx = (uint32_t) i24[1]; >> + idx = idx < (1<<24) ? idx : (1<<24)-1; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >> + tbl[1] = *ptbl; >> + >> + idx = (uint32_t) i24[2]; >> + idx = idx < (1<<24) ? idx : (1<<24)-1; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >> + tbl[2] = *ptbl; >> + >> + idx = (uint32_t) i24[3]; >> + idx = idx < (1<<24) ? idx : (1<<24)-1; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >> + tbl[3] = *ptbl; >> + >> + /* get 4 indexes for tbl8[]. */ >> + i8.x = vec_and(ip, mask8); >> + >> + pt = (uint64_t)tbl[0] | >> + (uint64_t)tbl[1] << 32; >> + pt2 = (uint64_t)tbl[2] | >> + (uint64_t)tbl[3] << 32; >> + >> + /* search successfully finished for all 4 IP addresses. 
*/ >> + if (likely((pt & mask_xv) == mask_v) && >> + likely((pt2 & mask_xv) == mask_v)) { >> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; >> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; >> + return; >> + } >> + >> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[0] = i8.u32[0] + >> + (uint8_t)tbl[0] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; >> + tbl[0] = *ptbl; >> + } >> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) >== >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[1] = i8.u32[1] + >> + (uint8_t)tbl[1] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; >> + tbl[1] = *ptbl; >> + } >> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[2] = i8.u32[2] + >> + (uint8_t)tbl[2] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; >> + tbl[2] = *ptbl; >> + } >> + if (unlikely((pt2 >> 32 & >RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[3] = i8.u32[3] + >> + (uint8_t)tbl[3] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; >> + tbl[3] = *ptbl; >> + } >> + >> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & >0x00FFFFFF : >> + > defv[0]; >> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & >0x00FFFFFF : >> + > defv[1]; >> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & >0x00FFFFFF : >> + > defv[2]; >> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & >0x00FFFFFF : >> + > defv[3]; >> +} >> + >> #ifdef __cplusplus >> } >> #endif >> diff --git a/lib/librte_lpm/rte_lpm_neon.h >b/lib/librte_lpm/rte_lpm_neon.h >> index 6c131d312..6ef635b18 100644 >> --- a/lib/librte_lpm/rte_lpm_neon.h >> +++ b/lib/librte_lpm/rte_lpm_neon.h >> @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm >*lpm, xmm_t ip, uint32_t hop[4], >> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & >0x00FFFFFF : defv; >> } >> >> +static inline void >> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >uint32_t hop[4], >> + uint32_t defv[4]) >> +{ >> + uint32x4_t i24; >> + rte_xmm_t i8; >> + uint32_t tbl[4]; >> + uint64_t idx, pt, pt2; >> + const uint32_t *ptbl; >> + >> + const uint32_t mask = UINT8_MAX; >> + const int32x4_t mask8 = vdupq_n_s32(mask); >> + >> + /* >> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries >> + * as one 64-bit value (0x0300000003000000). >> + */ >> + const uint64_t mask_xv = >> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | >> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << >32); >> + >> + /* >> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries >> + * as one 64-bit value (0x0100000001000000). >> + */ >> + const uint64_t mask_v = >> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | >> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); >> + >> + /* get 4 indexes for tbl24[]. */ >> + i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT); >> + >> + /* extract values from tbl24[] */ >> + idx = vgetq_lane_u64((uint64x2_t)i24, 0); >> + >> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >> + tbl[0] = *ptbl; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >> + tbl[1] = *ptbl; >> + >> + idx = vgetq_lane_u64((uint64x2_t)i24, 1); >> + >> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >> + tbl[2] = *ptbl; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >> + tbl[3] = *ptbl; >> + >> + /* get 4 indexes for tbl8[]. 
*/ >> + i8.x = vandq_s32(ip, mask8); >> + >> + pt = (uint64_t)tbl[0] | >> + (uint64_t)tbl[1] << 32; >> + pt2 = (uint64_t)tbl[2] | >> + (uint64_t)tbl[3] << 32; >> + >> + /* search successfully finished for all 4 IP addresses. */ >> + if (likely((pt & mask_xv) == mask_v) && >> + likely((pt2 & mask_xv) == mask_v)) { >> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; >> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; >> + return; >> + } >> + >> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[0] = i8.u32[0] + >> + (uint8_t)tbl[0] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; >> + tbl[0] = *ptbl; >> + } >> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) >== >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[1] = i8.u32[1] + >> + (uint8_t)tbl[1] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; >> + tbl[1] = *ptbl; >> + } >> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[2] = i8.u32[2] + >> + (uint8_t)tbl[2] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; >> + tbl[2] = *ptbl; >> + } >> + if (unlikely((pt2 >> 32 & >RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[3] = i8.u32[3] + >> + (uint8_t)tbl[3] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; >> + tbl[3] = *ptbl; >> + } >> + >> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & >0x00FFFFFF : >> + > defv[0]; >> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & >0x00FFFFFF : >> + > defv[1]; >> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & >0x00FFFFFF : >> + > defv[2]; >> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & >0x00FFFFFF : >> + > defv[3]; >> +} >> + >> #ifdef __cplusplus >> } >> #endif >> diff --git a/lib/librte_lpm/rte_lpm_sse.h >b/lib/librte_lpm/rte_lpm_sse.h >> index 44770b6ff..6ef15816c 100644 >> --- a/lib/librte_lpm/rte_lpm_sse.h >> +++ b/lib/librte_lpm/rte_lpm_sse.h >> @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm >*lpm, xmm_t ip, uint32_t hop[4], >> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & >0x00FFFFFF : defv; >> } >> >> +static inline void >> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >uint32_t hop[4], >> + uint32_t defv[4]) >> +{ >> + __m128i i24; >> + rte_xmm_t i8; >> + uint32_t tbl[4]; >> + uint64_t idx, pt, pt2; >> + const uint32_t *ptbl; >> + >> + const __m128i mask8 = >> + _mm_set_epi32(UINT8_MAX, UINT8_MAX, >UINT8_MAX, UINT8_MAX); >> + >> + /* >> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries >> + * as one 64-bit value (0x0300000003000000). >> + */ >> + const uint64_t mask_xv = >> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | >> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << >32); >> + >> + /* >> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries >> + * as one 64-bit value (0x0100000001000000). >> + */ >> + const uint64_t mask_v = >> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | >> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); >> + >> + /* get 4 indexes for tbl24[]. 
*/ >> + i24 = _mm_srli_epi32(ip, CHAR_BIT); >> + >> + /* extract values from tbl24[] */ >> + idx = _mm_cvtsi128_si64(i24); >> + /* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a >constant */ >> + i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8); >> + >> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >> + tbl[0] = *ptbl; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >> + tbl[1] = *ptbl; >> + >> + idx = _mm_cvtsi128_si64(i24); >> + >> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >> + tbl[2] = *ptbl; >> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >> + tbl[3] = *ptbl; >> + >> + /* get 4 indexes for tbl8[]. */ >> + i8.x = _mm_and_si128(ip, mask8); >> + >> + pt = (uint64_t)tbl[0] | >> + (uint64_t)tbl[1] << 32; >> + pt2 = (uint64_t)tbl[2] | >> + (uint64_t)tbl[3] << 32; >> + >> + /* search successfully finished for all 4 IP addresses. */ >> + if (likely((pt & mask_xv) == mask_v) && >> + likely((pt2 & mask_xv) == mask_v)) { >> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; >> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; >> + return; >> + } >> + >> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[0] = i8.u32[0] + >> + (uint8_t)tbl[0] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; >> + tbl[0] = *ptbl; >> + } >> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) >== >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[1] = i8.u32[1] + >> + (uint8_t)tbl[1] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; >> + tbl[1] = *ptbl; >> + } >> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[2] = i8.u32[2] + >> + (uint8_t)tbl[2] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; >> + tbl[2] = *ptbl; >> + } >> + if (unlikely((pt2 >> 32 & >RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >> + 
RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >> + i8.u32[3] = i8.u32[3] + >> + (uint8_t)tbl[3] * >RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; >> + tbl[3] = *ptbl; >> + } >> + >> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & >0x00FFFFFF : >> + > defv[0]; >> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & >0x00FFFFFF : >> + > defv[1]; >> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & >0x00FFFFFF : >> + > defv[2]; >> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & >0x00FFFFFF : >> + > defv[3]; >> +} >> + >> #ifdef __cplusplus >> } >> #endif > >-- >Regards, >Vladimir
Hi, On 13/01/2020 12:34, Pavan Nikhilesh Bhagavatula wrote: >> -----Original Message----- >> From: dev <dev-bounces@dpdk.org> On Behalf Of Medvedkin, >> Vladimir >> Sent: Monday, January 13, 2020 4:37 PM >> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin >> Jacob Kollanukkaran <jerinj@marvell.com>; Bruce Richardson >> <bruce.richardson@intel.com>; Gavin Hu <gavin.hu@arm.com> >> Cc: dev@dpdk.org >> Subject: Re: [dpdk-dev] [PATCH] lmp: add lookup x4 with x4 default >> values >> >> Hi Pavan, >> > Hi Medvedkin, > >> I don't think it is a good idea to add extra function because: >> >> 1) it is just a copy of an existing rte_lpm_lookupx4() except the last 4 >> ternary ops > Yes, but I had no other option as modifying the current function will break ABI ☹. > >> 2) What is a real world use case for that? Usually returned value is >> used as an index in an array of next_hop structs. > If we take l3fwd as an example the next hop holds fwd port_id whereas the default value > Passed holds mbuf->port. This allows Tx without having a branch. > > Event devices can aggregate packets from multiple ethernet ports and schedule them on > a core. The current API requires us to pass a BAD_PORT and compare the result for every > packet but if we are allowed to pass 4 different default values we could seamlessly send > them for Tx. > >> 3) You can have the same result by using special unused defv and >> pcmpeqd/vpblendd on a hop[4] after lookup > Yes, but sadly that would be architecture dependent. But rte_lpm_lookupx4() itself is architecture dependent. My suggestion here would be - implement rte_lpm_lookupx4_defx4() in arch-specific .c files as a wrapper around rte_lpm_lookupx4() and do the pcmpeqd/vpblendd stuff afterwards. In this case you won't need to copy all of this implemented code. > >> On 11/01/2020 16:08, pbhagavatula@marvell.com wrote: >>> From: Pavan Nikhilesh <pbhagavatula@marvell.com> >>> >>> Add lookup x4 with x4 default values.
>>> This can be used in usecases where we have to process burst of >> packets >>> from different ports. >>> >>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com> >>> --- >>> app/test/test_lpm_perf.c | 31 +++++++++ >>> lib/librte_lpm/rte_lpm.h | 23 +++++++ >>> lib/librte_lpm/rte_lpm_altivec.h | 109 >> +++++++++++++++++++++++++++++++ >>> lib/librte_lpm/rte_lpm_neon.h | 102 >> +++++++++++++++++++++++++++++ >>> lib/librte_lpm/rte_lpm_sse.h | 104 >> +++++++++++++++++++++++++++++ >>> 5 files changed, 369 insertions(+) >>> >>> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c >>> index a2578fe90..8e9d4c7eb 100644 >>> --- a/app/test/test_lpm_perf.c >>> +++ b/app/test/test_lpm_perf.c >>> @@ -460,6 +460,37 @@ test_lpm_perf(void) >>> (double)total_time / ((double)ITERATIONS * >> BATCH_SIZE), >>> (count * 100.0) / (double)(ITERATIONS * >> BATCH_SIZE)); >>> + /* Measure LookupX4 DefaultX4 */ >>> + total_time = 0; >>> + count = 0; >>> + uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, >> UINT32_MAX}; >>> + for (i = 0; i < ITERATIONS; i++) { >>> + static uint32_t ip_batch[BATCH_SIZE]; >>> + uint32_t next_hops[4]; >>> + >>> + /* Create array of random IP addresses */ >>> + for (j = 0; j < BATCH_SIZE; j++) >>> + ip_batch[j] = rte_rand(); >>> + >>> + /* Lookup per batch */ >>> + begin = rte_rdtsc(); >>> + for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) { >>> + unsigned int k; >>> + xmm_t ipx4; >>> + >>> + ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + >> j)); >>> + ipx4 = *(xmm_t *)(ip_batch + j); >>> + rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops, >> def); >>> + for (k = 0; k < RTE_DIM(next_hops); k++) >>> + if (unlikely(next_hops[k] == >> UINT32_MAX)) >>> + count++; >>> + } >>> + >>> + total_time += rte_rdtsc() - begin; >>> + } >>> + printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n", >>> + (double)total_time / ((double)ITERATIONS * >> BATCH_SIZE), >>> + (count * 100.0) / (double)(ITERATIONS * >> BATCH_SIZE)); >>> /* Measure Delete */ >>> 
status = 0; >>> begin = rte_rdtsc(); >>> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h >>> index b9d49ac87..e66b43e06 100644 >>> --- a/lib/librte_lpm/rte_lpm.h >>> +++ b/lib/librte_lpm/rte_lpm.h >>> @@ -370,6 +370,29 @@ static inline void >>> rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t >> hop[4], >>> uint32_t defv); >>> >>> +/** >>> + * Lookup four IP addresses in an LPM table. >>> + * >>> + * @param lpm >>> + * LPM object handle >>> + * @param ip >>> + * Four IPs to be looked up in the LPM table >>> + * @param hop >>> + * Next hop of the most specific rule found for IP (valid on lookup >> hit only). >>> + * This is an 4 elements array of two byte values. >>> + * If the lookup was successful for the given IP, then least significant >> byte >>> + * of the corresponding element is the actual next hop and the >> most >>> + * significant byte is zero. >>> + * If the lookup for the given IP failed, then corresponding element >> would >>> + * contain default value, see description of then next parameter. >>> + * @param defv >>> + * Default value[] to populate into corresponding element of hop[] >> array, >>> + * if lookup would fail. >>> + */ >>> +static inline void >>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >> uint32_t hop[4], >>> + uint32_t defv[4]); >>> + >>> #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) >>> #include "rte_lpm_neon.h" >>> #elif defined(RTE_ARCH_PPC_64) >>> diff --git a/lib/librte_lpm/rte_lpm_altivec.h >> b/lib/librte_lpm/rte_lpm_altivec.h >>> index 228c41b38..1afc7bd74 100644 >>> --- a/lib/librte_lpm/rte_lpm_altivec.h >>> +++ b/lib/librte_lpm/rte_lpm_altivec.h >>> @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm >> *lpm, xmm_t ip, uint32_t hop[4], >>> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & >> 0x00FFFFFF : defv; >>> } >>> >>> +static inline void >>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >> uint32_t hop[4], >>> + uint32_t defv[4]) >>> +{ >>> + vector signed int i24; >>> + rte_xmm_t i8; >>> + uint32_t tbl[4]; >>> + uint64_t idx, pt, pt2; >>> + const uint32_t *ptbl; >>> + >>> + const uint32_t mask = UINT8_MAX; >>> + const vector signed int mask8 = (xmm_t){mask, mask, mask, >> mask}; >>> + >>> + /* >>> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries >>> + * as one 64-bit value (0x0300000003000000). >>> + */ >>> + const uint64_t mask_xv = >>> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | >>> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << >> 32); >>> + >>> + /* >>> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries >>> + * as one 64-bit value (0x0100000001000000). >>> + */ >>> + const uint64_t mask_v = >>> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | >>> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); >>> + >>> + /* get 4 indexes for tbl24[]. */ >>> + i24 = vec_sr((xmm_t) ip, >>> + (vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, >> CHAR_BIT}); >>> + >>> + /* extract values from tbl24[] */ >>> + idx = (uint32_t)i24[0]; >>> + idx = idx < (1<<24) ? idx : (1<<24)-1; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >>> + tbl[0] = *ptbl; >>> + >>> + idx = (uint32_t) i24[1]; >>> + idx = idx < (1<<24) ? idx : (1<<24)-1; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >>> + tbl[1] = *ptbl; >>> + >>> + idx = (uint32_t) i24[2]; >>> + idx = idx < (1<<24) ? idx : (1<<24)-1; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >>> + tbl[2] = *ptbl; >>> + >>> + idx = (uint32_t) i24[3]; >>> + idx = idx < (1<<24) ? idx : (1<<24)-1; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx]; >>> + tbl[3] = *ptbl; >>> + >>> + /* get 4 indexes for tbl8[]. 
*/ >>> + i8.x = vec_and(ip, mask8); >>> + >>> + pt = (uint64_t)tbl[0] | >>> + (uint64_t)tbl[1] << 32; >>> + pt2 = (uint64_t)tbl[2] | >>> + (uint64_t)tbl[3] << 32; >>> + >>> + /* search successfully finished for all 4 IP addresses. */ >>> + if (likely((pt & mask_xv) == mask_v) && >>> + likely((pt2 & mask_xv) == mask_v)) { >>> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; >>> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; >>> + return; >>> + } >>> + >>> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[0] = i8.u32[0] + >>> + (uint8_t)tbl[0] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; >>> + tbl[0] = *ptbl; >>> + } >>> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) >> == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[1] = i8.u32[1] + >>> + (uint8_t)tbl[1] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; >>> + tbl[1] = *ptbl; >>> + } >>> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[2] = i8.u32[2] + >>> + (uint8_t)tbl[2] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; >>> + tbl[2] = *ptbl; >>> + } >>> + if (unlikely((pt2 >> 32 & >> RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[3] = i8.u32[3] + >>> + (uint8_t)tbl[3] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; >>> + tbl[3] = *ptbl; >>> + } >>> + >>> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & >> 0x00FFFFFF : >>> + >> defv[0]; >>> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & >> 0x00FFFFFF : >>> + >> defv[1]; >>> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & >> 0x00FFFFFF : >>> + >> defv[2]; >>> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & >> 0x00FFFFFF : >>> + >> defv[3]; >>> +} >>> + >>> #ifdef __cplusplus >>> } >>> #endif >>> diff --git a/lib/librte_lpm/rte_lpm_neon.h >> b/lib/librte_lpm/rte_lpm_neon.h >>> index 6c131d312..6ef635b18 100644 >>> --- a/lib/librte_lpm/rte_lpm_neon.h >>> +++ b/lib/librte_lpm/rte_lpm_neon.h >>> @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm >> *lpm, xmm_t ip, uint32_t hop[4], >>> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & >> 0x00FFFFFF : defv; >>> } >>> >>> +static inline void >>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >> uint32_t hop[4], >>> + uint32_t defv[4]) >>> +{ >>> + uint32x4_t i24; >>> + rte_xmm_t i8; >>> + uint32_t tbl[4]; >>> + uint64_t idx, pt, pt2; >>> + const uint32_t *ptbl; >>> + >>> + const uint32_t mask = UINT8_MAX; >>> + const int32x4_t mask8 = vdupq_n_s32(mask); >>> + >>> + /* >>> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries >>> + * as one 64-bit value (0x0300000003000000). >>> + */ >>> + const uint64_t mask_xv = >>> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | >>> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << >> 32); >>> + >>> + /* >>> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries >>> + * as one 64-bit value (0x0100000001000000). >>> + */ >>> + const uint64_t mask_v = >>> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | >>> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); >>> + >>> + /* get 4 indexes for tbl24[]. */ >>> + i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT); >>> + >>> + /* extract values from tbl24[] */ >>> + idx = vgetq_lane_u64((uint64x2_t)i24, 0); >>> + >>> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >>> + tbl[0] = *ptbl; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >>> + tbl[1] = *ptbl; >>> + >>> + idx = vgetq_lane_u64((uint64x2_t)i24, 1); >>> + >>> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >>> + tbl[2] = *ptbl; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >>> + tbl[3] = *ptbl; >>> + >>> + /* get 4 indexes for tbl8[]. 
*/ >>> + i8.x = vandq_s32(ip, mask8); >>> + >>> + pt = (uint64_t)tbl[0] | >>> + (uint64_t)tbl[1] << 32; >>> + pt2 = (uint64_t)tbl[2] | >>> + (uint64_t)tbl[3] << 32; >>> + >>> + /* search successfully finished for all 4 IP addresses. */ >>> + if (likely((pt & mask_xv) == mask_v) && >>> + likely((pt2 & mask_xv) == mask_v)) { >>> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; >>> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; >>> + return; >>> + } >>> + >>> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[0] = i8.u32[0] + >>> + (uint8_t)tbl[0] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; >>> + tbl[0] = *ptbl; >>> + } >>> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) >> == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[1] = i8.u32[1] + >>> + (uint8_t)tbl[1] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; >>> + tbl[1] = *ptbl; >>> + } >>> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[2] = i8.u32[2] + >>> + (uint8_t)tbl[2] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; >>> + tbl[2] = *ptbl; >>> + } >>> + if (unlikely((pt2 >> 32 & >> RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[3] = i8.u32[3] + >>> + (uint8_t)tbl[3] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; >>> + tbl[3] = *ptbl; >>> + } >>> + >>> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & >> 0x00FFFFFF : >>> + >> defv[0]; >>> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & >> 0x00FFFFFF : >>> + >> defv[1]; >>> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & >> 0x00FFFFFF : >>> + >> defv[2]; >>> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? 
tbl[3] & >> 0x00FFFFFF : >>> + >> defv[3]; >>> +} >>> + >>> #ifdef __cplusplus >>> } >>> #endif >>> diff --git a/lib/librte_lpm/rte_lpm_sse.h >> b/lib/librte_lpm/rte_lpm_sse.h >>> index 44770b6ff..6ef15816c 100644 >>> --- a/lib/librte_lpm/rte_lpm_sse.h >>> +++ b/lib/librte_lpm/rte_lpm_sse.h >>> @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm >> *lpm, xmm_t ip, uint32_t hop[4], >>> hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & >> 0x00FFFFFF : defv; >>> } >>> >>> +static inline void >>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, >> uint32_t hop[4], >>> + uint32_t defv[4]) >>> +{ >>> + __m128i i24; >>> + rte_xmm_t i8; >>> + uint32_t tbl[4]; >>> + uint64_t idx, pt, pt2; >>> + const uint32_t *ptbl; >>> + >>> + const __m128i mask8 = >>> + _mm_set_epi32(UINT8_MAX, UINT8_MAX, >> UINT8_MAX, UINT8_MAX); >>> + >>> + /* >>> + * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries >>> + * as one 64-bit value (0x0300000003000000). >>> + */ >>> + const uint64_t mask_xv = >>> + ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | >>> + (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << >> 32); >>> + >>> + /* >>> + * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries >>> + * as one 64-bit value (0x0100000001000000). >>> + */ >>> + const uint64_t mask_v = >>> + ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | >>> + (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32); >>> + >>> + /* get 4 indexes for tbl24[]. 
*/ >>> + i24 = _mm_srli_epi32(ip, CHAR_BIT); >>> + >>> + /* extract values from tbl24[] */ >>> + idx = _mm_cvtsi128_si64(i24); >>> + /* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a >> constant */ >>> + i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8); >>> + >>> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >>> + tbl[0] = *ptbl; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >>> + tbl[1] = *ptbl; >>> + >>> + idx = _mm_cvtsi128_si64(i24); >>> + >>> + ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx]; >>> + tbl[2] = *ptbl; >>> + ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32]; >>> + tbl[3] = *ptbl; >>> + >>> + /* get 4 indexes for tbl8[]. */ >>> + i8.x = _mm_and_si128(ip, mask8); >>> + >>> + pt = (uint64_t)tbl[0] | >>> + (uint64_t)tbl[1] << 32; >>> + pt2 = (uint64_t)tbl[2] | >>> + (uint64_t)tbl[3] << 32; >>> + >>> + /* search successfully finished for all 4 IP addresses. */ >>> + if (likely((pt & mask_xv) == mask_v) && >>> + likely((pt2 & mask_xv) == mask_v)) { >>> + *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES; >>> + *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES; >>> + return; >>> + } >>> + >>> + if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[0] = i8.u32[0] + >>> + (uint8_t)tbl[0] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]]; >>> + tbl[0] = *ptbl; >>> + } >>> + if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) >> == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[1] = i8.u32[1] + >>> + (uint8_t)tbl[1] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]]; >>> + tbl[1] = *ptbl; >>> + } >>> + if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[2] = i8.u32[2] + >>> + (uint8_t)tbl[2] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]]; >>> + tbl[2] = *ptbl; >>> + } >>> + if 
(unlikely((pt2 >> 32 & >> RTE_LPM_VALID_EXT_ENTRY_BITMASK) == >>> + RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { >>> + i8.u32[3] = i8.u32[3] + >>> + (uint8_t)tbl[3] * >> RTE_LPM_TBL8_GROUP_NUM_ENTRIES; >>> + ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]]; >>> + tbl[3] = *ptbl; >>> + } >>> + >>> + hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & >> 0x00FFFFFF : >>> + >> defv[0]; >>> + hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & >> 0x00FFFFFF : >>> + >> defv[1]; >>> + hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & >> 0x00FFFFFF : >>> + >> defv[2]; >>> + hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & >> 0x00FFFFFF : >>> + >> defv[3]; >>> +} >>> + >>> #ifdef __cplusplus >>> } >>> #endif >> -- >> Regards, >> Vladimir -- Regards, Vladimir