From: Joyce Kong <joyce.kong@arm.com>
To: maxime.coquelin@redhat.com
Cc: jerinj@marvell.com, dev@dpdk.org, nd@arm.com, honnappa.nagarahalli@arm.com,
 ruifeng.wang@arm.com, phil.yang@arm.com
Date: Fri, 11 Sep 2020 20:09:05 +0800
Message-Id: <20200911120906.45995-3-joyce.kong@arm.com>
X-Mailer: git-send-email 2.28.0
In-Reply-To: <20200911120906.45995-1-joyce.kong@arm.com>
References: <20200911120906.45995-1-joyce.kong@arm.com>
Subject: [dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON path

Optimize the packed ring Rx batch path with NEON instructions.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/virtio/meson.build               |   1 +
 drivers/net/virtio/virtio_rxtx.c             |   7 +-
 drivers/net/virtio/virtio_rxtx_packed.h      |  16 ++
 drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++++++++
 4 files changed, 224 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c

diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index e1851b0a6..5af633686 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -34,6 +34,7 @@ elif arch_subdir == 'ppc'
 	sources += files('virtio_rxtx_simple_altivec.c')
 elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64')
 	sources += files('virtio_rxtx_simple_neon.c')
+	sources += files('virtio_rxtx_packed_neon.c')
 endif
 
 if is_linux
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index f915b8a2c..1deb77569 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -2020,7 +2020,8 @@ virtio_xmit_pkts_inorder(void *tx_queue,
 	return nb_tx;
 }
 
-#ifndef CC_AVX512_SUPPORT
+#if !defined(CC_AVX512_SUPPORT) && !defined(RTE_ARCH_ARM) && \
+	!defined(RTE_ARCH_ARM64)
 uint16_t
 virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 			struct rte_mbuf **rx_pkts __rte_unused,
@@ -2028,7 +2029,9 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 {
 	return 0;
 }
+#endif
 
+#if !defined(CC_AVX512_SUPPORT)
 uint16_t
 virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
 			struct rte_mbuf **tx_pkts __rte_unused,
@@ -2036,4 +2039,4 @@ virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
 {
 	return 0;
 }
-#endif /* ifndef CC_AVX512_SUPPORT */
+#endif
diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
index b2447843b..fd2d6baa5 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.h
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -19,9 +19,16 @@
 #include "virtqueue.h"
 
 #define BYTE_SIZE 8
+
+#if defined(AVX512_SUPPORT)
 /* flag bits offset in packed ring desc higher 64bits */
 #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
 	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+/* flag bits offset in packed ring desc 32bits */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
+#endif
 
 #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
 	FLAGS_BITS_OFFSET)
@@ -44,8 +51,17 @@
 /* net hdr short size mask */
 #define NET_HDR_MASK 0x3F
 
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+/* The cache line size differs across aarch64 platforms, so
+ * put a batch size of four here to match the minimum
+ * cache line size.
+ */
+#define PACKED_BATCH_SIZE 4
+#else
 #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
 	sizeof(struct vring_packed_desc))
+#endif
+
 #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
 
 #ifdef VIRTIO_GCC_UNROLL_PRAGMA
diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.c b/drivers/net/virtio/virtio_rxtx_packed_neon.c
new file mode 100644
index 000000000..182afe5c6
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.c
@@ -0,0 +1,202 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+#include <rte_vect.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t head_size = hw->vtnet_hdr_size;
+	uint16_t id = vq->vq_used_cons_idx;
+	struct vring_packed_desc *p_desc;
+	uint16_t i;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Map packed descriptor to mbuf fields. */
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		0, 1,			/* octet 1~0, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		0, 1,			/* octet 1~0, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		8, 9,			/* octet 9~8, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		8, 9,			/* octet 9~8, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length. */
+	uint16x8_t len_adjust = {
+		0, 0,		/* ignore pkt_type field */
+		head_size,	/* sub head_size on pkt_len */
+		0,		/* ignore high 16 bits of pkt_len */
+		head_size,	/* sub head_size on data_len */
+		0, 0, 0		/* ignore non-length fields */
+	};
+
+	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
+	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
+	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
+
+	p_desc = &vq->vq_packed.ring.desc[id];
+	/* Load packed descriptors 0,1. */
+	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
+	/* Load packed descriptors 2,3. */
+	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
+
+	/* Only care about the avail/used bits. */
+	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
+	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
+				vreinterpretq_u32_u64(desc[1]));
+	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
+
+	uint32x4_t v_used_flag = vdupq_n_u32(0);
+	if (vq->vq_packed.used_wrap_counter)
+		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
+
+	poly128_t desc_stats = vreinterpretq_p128_u32(vmvnq_u32(vceqq_u32(v_flag,
+					v_used_flag)));
+
+	/* Check all descs are used. */
+	if (desc_stats)
+		return -1;
+
+	/* Load 2 mbuf pointers at a time. */
+	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
+	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
+
+	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
+	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
+
+	/**
+	 * Update data length and packet length for descriptor.
+	 * structure of pkt_mb:
+	 * --------------------------------------------------------------------
+	 * |4 octet pkt_type|4 octet pkt_len|2 octet data_len|2 octet vlan_tci|
+	 * --------------------------------------------------------------------
+	 */
+	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+
+	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+
+	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
+	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
+	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
+	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - head_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue,
+			struct rte_mbuf **rx_pkts,
+			uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t num, nb_rx = 0;
+	uint32_t nb_enqueued = 0;
+	uint16_t free_cnt = vq->vq_free_thresh;
+
+	if (unlikely(hw->started == 0))
+		return nb_rx;
+
+	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
+	if (likely(num > PACKED_BATCH_SIZE))
+		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
+
+	while (num) {
+		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx += PACKED_BATCH_SIZE;
+			num -= PACKED_BATCH_SIZE;
+			continue;
+		}
+		if (!virtqueue_dequeue_single_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx++;
+			num--;
+			continue;
+		}
+		break;
+	}
+
+	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
+
+	rxvq->stats.packets += nb_rx;
+
+	if (likely(vq->vq_free_cnt >= free_cnt)) {
+		struct rte_mbuf *new_pkts[free_cnt];
+		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
+						free_cnt) == 0)) {
+			virtio_recv_refill_packed_vec(rxvq, new_pkts,
+					free_cnt);
+			nb_enqueued += free_cnt;
+		} else {
+			struct rte_eth_dev *dev =
+				&rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed += free_cnt;
+		}
+	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_RX_LOG(DEBUG, "Notified");
+		}
+	}
+
+	return nb_rx;
+}
-- 
2.28.0
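
For readers less familiar with the NEON intrinsics used in the batch path, the sketch below distills the availability check into a standalone helper: four 16-byte packed descriptors are de-interleaved with vld2q_u64 so the id/flags words land in a single vector, the avail/used bits are isolated with a mask, and the batch is accepted only when every lane matches the expected used-wrap value. This is a minimal illustrative sketch; struct pkd_desc and check_batch_used() are hypothetical stand-ins, not part of the patch or of the virtio driver API.

#include <stdint.h>
#include <arm_neon.h>

/* Simplified stand-in for struct vring_packed_desc (16 bytes, little endian). */
struct pkd_desc {
	uint64_t addr;
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

/*
 * Return 1 when all four descriptors starting at 'd' carry the expected
 * avail/used flag value, 0 otherwise. 'flags_mask' selects the avail/used
 * bits inside the 32-bit id/flags word; 'expected' is that mask when the
 * used wrap counter is set, or 0 when it is clear.
 */
static inline int
check_batch_used(const struct pkd_desc *d, uint32_t flags_mask,
		uint32_t expected)
{
	/* De-interleave descriptors 0,1 and 2,3: val[1] keeps {len, id/flags}. */
	uint64x2_t hi01 = vld2q_u64((const uint64_t *)d).val[1];
	uint64x2_t hi23 = vld2q_u64((const uint64_t *)(d + 2)).val[1];

	/* Gather the id/flags 32-bit word of each of the four descriptors. */
	uint32x4_t idflags = vuzp2q_u32(vreinterpretq_u32_u64(hi01),
					vreinterpretq_u32_u64(hi23));
	uint32x4_t flags = vandq_u32(idflags, vdupq_n_u32(flags_mask));

	/* A lane becomes non-zero when its flags do NOT match the expectation. */
	uint32x4_t mismatch = vmvnq_u32(vceqq_u32(flags, vdupq_n_u32(expected)));

	/* The batch is usable only when no lane mismatches. */
	return vmaxvq_u32(mismatch) == 0;
}

With the patch's definitions, flags_mask corresponds to PACKED_FLAGS_MASK, and expected is PACKED_FLAGS_MASK when vq->vq_packed.used_wrap_counter is set or 0 otherwise.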