On Sun, Jun 12, 2022 at 11:22 PM Ruifeng Wang wrote:
>
> To read descriptors in the expected order, barriers are inserted after
> each descriptor read. The excessive use of barriers is unnecessary and
> can cause a performance drop.
>
> Removed the barriers between descriptor reads, and changed the counting
> of valid packets to handle discontinuous valid packets, because
> out-of-order reads can leave the fetched valid descriptors
> discontinuous.
>
> In a VPP L3 routing test, a 6% performance gain was observed. The test
> was done on a platform with a ThunderX2 CPU and a Broadcom PS225 NIC.
>
> Signed-off-by: Ruifeng Wang

Reviewed-by: Ajit Khaparde

Patch applied to dpdk-next-net-brcm. Thanks

> ---
>  drivers/net/bnxt/bnxt_rxtx_vec_neon.c | 47 ++++++++++++++-------------
>  1 file changed, 25 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c
> index 32f8e59b3a..6a4ece681b 100644
> --- a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c
> +++ b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c
> @@ -235,34 +235,32 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
>  		 * IO barriers are used to ensure consistent state.
>  		 */
>  		rxcmp1[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 7]);
> -		rte_io_rmb();
> +		rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
> +		rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
> +		rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
> +
> +		/* Use acquire fence to order loads of descriptor words. */
> +		rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
>  		/* Reload lower 64b of descriptors to make it ordered after info3_v. */
>  		rxcmp1[3] = vreinterpretq_u32_u64(vld1q_lane_u64
>  				((void *)&cpr->cp_desc_ring[cons + 7],
>  				vreinterpretq_u64_u32(rxcmp1[3]), 0));
> -		rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
> -
> -		rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
> -		rte_io_rmb();
>  		rxcmp1[2] = vreinterpretq_u32_u64(vld1q_lane_u64
>  				((void *)&cpr->cp_desc_ring[cons + 5],
>  				vreinterpretq_u64_u32(rxcmp1[2]), 0));
> -		rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
> -
> -		t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
> -
> -		rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
> -		rte_io_rmb();
>  		rxcmp1[1] = vreinterpretq_u32_u64(vld1q_lane_u64
>  				((void *)&cpr->cp_desc_ring[cons + 3],
>  				vreinterpretq_u64_u32(rxcmp1[1]), 0));
> -		rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
> -
> -		rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
> -		rte_io_rmb();
>  		rxcmp1[0] = vreinterpretq_u32_u64(vld1q_lane_u64
>  				((void *)&cpr->cp_desc_ring[cons + 1],
>  				vreinterpretq_u64_u32(rxcmp1[0]), 0));
> +
> +		rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
> +		rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
> +
> +		t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
> +
> +		rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
>  		rxcmp[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 0]);
>
>  		t0 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[0], rxcmp1[1]));
> @@ -278,16 +276,21 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
>  		 * bits and count the number of set bits in order to determine
>  		 * the number of valid descriptors.
>  		 */
> -		valid = vget_lane_u64(vreinterpret_u64_u16(vqmovn_u32(info3_v)),
> -				      0);
> +		valid = vget_lane_u64(vreinterpret_u64_s16(vshr_n_s16
> +				(vreinterpret_s16_u16(vshl_n_u16
> +				(vqmovn_u32(info3_v), 15)), 15)), 0);
> +
>  		/*
>  		 * At this point, 'valid' is a 64-bit value containing four
> -		 * 16-bit fields, each of which is either 0x0001 or 0x0000.
> -		 * Compute number of valid descriptors from the index of
> -		 * the highest non-zero field.
> +		 * 16-bit fields, each of which is either 0xffff or 0x0000.
> +		 * Count the number of consecutive 1s from LSB in order to
> +		 * determine the number of valid descriptors.
>  		 */
> -		num_valid = (sizeof(uint64_t) / sizeof(uint16_t)) -
> -			    (__builtin_clzl(valid & desc_valid_mask) / 16);
> +		valid = ~(valid & desc_valid_mask);
> +		if (valid == 0)
> +			num_valid = 4;
> +		else
> +			num_valid = __builtin_ctzl(valid) / 16;
>
>  		if (num_valid == 0)
>  			break;
> --
> 2.25.1
>
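For readers less familiar with the first hunk, the ordering pattern the
patch adopts can be shown with a minimal scalar sketch. This is not bnxt
code: the descriptor layout, RING_VALID flag, and read_burst() are
illustrative assumptions; only the single-acquire-fence idea mirrors the
patch.

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative descriptor: the device writes 'data', then sets the
 * valid bit in 'flags'. Names are hypothetical, not from bnxt. */
struct desc {
	uint64_t data;
	uint32_t flags;
};
#define RING_VALID 0x1u

static int
read_burst(volatile struct desc *ring, uint64_t out[4])
{
	uint32_t f[4];
	int i, n = 0;

	/* Read all flag words first, in whatever order the CPU likes. */
	for (i = 0; i < 4; i++)
		f[i] = ring[i].flags;

	/* One acquire fence orders every payload load below after every
	 * flag load above, replacing a barrier per descriptor. */
	atomic_thread_fence(memory_order_acquire);

	/* Consume only the leading run of valid descriptors; validity
	 * may be discontinuous when flags were read out of order. */
	for (i = 0; i < 4 && (f[i] & RING_VALID); i++)
		out[n++] = ring[i].data;
	return n;
}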
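The new counting step in the second hunk also has a compact scalar
equivalent. A minimal sketch, assuming the same four 16-bit lanes of
0xffff/0x0000 that the NEON code produces; count_valid() and the test
values are made up for illustration:

#include <stdint.h>
#include <stdio.h>

static unsigned int
count_valid(uint64_t valid, uint64_t desc_valid_mask)
{
	/* Invert so the first invalid lane contributes the first set bit. */
	uint64_t inv = ~(valid & desc_valid_mask);

	if (inv == 0)
		return 4; /* all four lanes valid */
	/* Trailing zeros / 16 = length of the leading run of valid lanes. */
	return __builtin_ctzll(inv) / 16;
}

int main(void)
{
	/* Lanes (LSB first): valid, valid, invalid, valid. Only the
	 * first two descriptors may be consumed. */
	uint64_t v = 0xffff0000ffffffffULL;

	printf("%u\n", count_valid(v, UINT64_MAX)); /* prints 2 */
	return 0;
}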