From mboxrd@z Thu Jan 1 00:00:00 1970
From: Jie Liu
To: stephen@networkplumber.org
Cc: dev@dpdk.org, JieLiu
Subject: [PATCH 12/13] net/sxe: add simd function
Date: Thu, 24 Apr 2025 19:36:51 -0700
Message-Id: <20250425023652.37368-12-liujie5@linkdatatechnology.com>
X-Mailer: git-send-email 2.27.0
In-Reply-To: <20250425023652.37368-1-liujie5@linkdatatechnology.com>
References: <20250425023652.37368-1-liujie5@linkdatatechnology.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

From: JieLiu

Add SIMD (vectorized) Rx/Tx burst functions for the sxe PMD: an SSE
implementation for x86, a NEON implementation for arm64, the shared
vector helpers in sxe_vec_common.h, and the runtime selection of the
vector code path when the queue and offload preconditions are met.
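
The vector data path is only taken when several preconditions hold. As a
reference, the sketch below is illustrative only (the helper name
sxe_vector_rx_usable() is not part of this patch); it summarizes the
checks made by __sxe_rx_queue_setup() and __sxe_rx_function_set():

#include <stdbool.h>
#include <rte_common.h>
#include <rte_vect.h>
#include <ethdev_driver.h>   /* rte_ethdev_driver.h on older releases */

/* Illustrative summary of the vector-Rx preconditions; not driver code.
 * sxe_rx_vec_condition_check() additionally rejects flow-director mode
 * and, on the NEON path, Rx checksum offload. */
static bool sxe_vector_rx_usable(struct rte_eth_dev *dev, uint16_t desc_num,
				 bool rx_batch_alloc_allowed)
{
	if (dev->data->lro)			/* LRO keeps the scalar LRO path */
		return false;
	if (!rx_batch_alloc_allowed)		/* bulk allocation is required */
		return false;
	if (!rte_is_power_of_2(desc_num))	/* ring depth must be a power of two */
		return false;
	if (rte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_128)
		return false;			/* need at least 128-bit SIMD enabled */
	return true;	/* sxe_pkts_vec_recv()/sxe_scattered_pkts_vec_recv() is used */
}

When any of these checks fails, the driver falls back to the existing
bulk-alloc or single-alloc receive functions.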
Signed-off-by: Jie Liu --- drivers/net/sxe/Makefile | 7 + drivers/net/sxe/base/sxe_queue_common.c | 55 ++ drivers/net/sxe/base/sxe_rx_common.c | 145 ++++- drivers/net/sxe/meson.build | 9 + drivers/net/sxe/pf/sxe.h | 3 + drivers/net/sxe/pf/sxe_ethdev.c | 5 + drivers/net/sxe/pf/sxe_rx.c | 3 + drivers/net/sxe/pf/sxe_vec_common.h | 325 ++++++++++ drivers/net/sxe/pf/sxe_vec_neon.c | 760 ++++++++++++++++++++++++ drivers/net/sxe/pf/sxe_vec_sse.c | 638 ++++++++++++++++++++ 10 files changed, 1948 insertions(+), 2 deletions(-) create mode 100644 drivers/net/sxe/pf/sxe_vec_common.h create mode 100644 drivers/net/sxe/pf/sxe_vec_neon.c create mode 100644 drivers/net/sxe/pf/sxe_vec_sse.c diff --git a/drivers/net/sxe/Makefile b/drivers/net/sxe/Makefile index 8e1e2a53a2..17c24861db 100644 --- a/drivers/net/sxe/Makefile +++ b/drivers/net/sxe/Makefile @@ -11,6 +11,7 @@ LIB = librte_pmd_sxe.a CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += -DSXE_DPDK CFLAGS += -DSXE_HOST_DRIVER +CFLAGS += -DSXE_DPDK_SIMD CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) @@ -80,6 +81,12 @@ SRCS-$(CONFIG_RTE_LIBRTE_SXE_PMD) += sxe_rx.c SRCS-$(CONFIG_RTE_LIBRTE_SXE_PMD) += sxe_stats.c SRCS-$(CONFIG_RTE_LIBRTE_SXE_PMD) += sxe_tx.c +ifeq ($(CONFIG_RTE_ARCH_ARM64),y) +SRCS-$(CONFIG_RTE_LIBRTE_SXE_PMD) += sxe_vec_neon.c +else +SRCS-$(CONFIG_RTE_LIBRTE_SXE_PMD) += sxe_vec_sse.c +endif + # install this header file SYMLINK-$(CONFIG_RTE_LIBRTE_SXE_PMD)-include := rte_pmd_sxe.h SYMLINK-$(CONFIG_RTE_LIBRTE_SXE_PMD)-include += sxe_dcb.h diff --git a/drivers/net/sxe/base/sxe_queue_common.c b/drivers/net/sxe/base/sxe_queue_common.c index 1470fb8e5c..f2af7923e8 100644 --- a/drivers/net/sxe/base/sxe_queue_common.c +++ b/drivers/net/sxe/base/sxe_queue_common.c @@ -22,6 +22,10 @@ #include "sxe_logs.h" #include "sxe_regs.h" #include "sxe.h" +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#include "sxe_vec_common.h" +#include +#endif #include "sxe_queue_common.h" #include "sxe_queue.h" @@ -66,6 +70,10 @@ s32 __rte_cold __sxe_rx_queue_setup(struct rx_setup *rx_setup, bool is_vf) u16 len; u64 offloads; s32 ret = 0; +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + struct sxe_adapter *pf_adapter = dev->data->dev_private; + struct sxevf_adapter *vf_adapter = dev->data->dev_private; +#endif PMD_INIT_FUNC_TRACE(); @@ -170,6 +178,23 @@ s32 __rte_cold __sxe_rx_queue_setup(struct rx_setup *rx_setup, bool is_vf) "dma_addr=0x%" SXE_PRIX64, rxq->buffer_ring, rxq->sc_buffer_ring, rxq->desc_ring, rxq->base_addr); + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + if (!rte_is_power_of_2(desc_num)) { + PMD_LOG_DEBUG(INIT, "queue[%d] doesn't meet Vector Rx " + "preconditions - canceling the feature for " + "the whole port[%d]", + rxq->queue_id, rxq->port_id); + if (is_vf) + vf_adapter->rx_vec_allowed = false; + else + pf_adapter->rx_vec_allowed = false; + + } else { + sxe_rxq_vec_setup(rxq); + } +#endif + dev->data->rx_queues[queue_idx] = rxq; sxe_rx_queue_init(*rx_setup->rx_batch_alloc_allowed, rxq); @@ -265,6 +290,9 @@ void __sxe_recycle_rxq_info_get(struct rte_eth_dev *dev, u16 queue_id, struct rte_eth_recycle_rxq_info *q_info) { struct sxe_rx_queue *rxq; +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + struct sxe_adapter *adapter = dev->data->dev_private; +#endif rxq = dev->data->rx_queues[queue_id]; @@ -273,8 +301,22 @@ void __sxe_recycle_rxq_info_get(struct rte_eth_dev *dev, u16 queue_id, q_info->mbuf_ring_size = rxq->ring_depth; q_info->receive_tail = &rxq->processing_idx; +#if defined SXE_DPDK_L4_FEATURES && defined 
SXE_DPDK_SIMD + if (adapter->rx_vec_allowed) { +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM) + q_info->refill_requirement = rxq->realloc_num; + q_info->refill_head = &rxq->realloc_start; +#endif + } else { + q_info->refill_requirement = rxq->batch_alloc_size; + q_info->refill_head = &rxq->batch_alloc_trigger; + } +#else q_info->refill_requirement = rxq->batch_alloc_size; q_info->refill_head = &rxq->batch_alloc_trigger; +#endif + + return; } #endif #endif @@ -302,7 +344,20 @@ s32 __sxe_tx_done_cleanup(void *tx_queue, u32 free_cnt) struct sxe_tx_queue *txq = (struct sxe_tx_queue *)tx_queue; if (txq->offloads == 0 && txq->rs_thresh >= RTE_PMD_SXE_MAX_TX_BURST) { +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + if (txq->rs_thresh <= RTE_SXE_MAX_TX_FREE_BUF_SZ && +#ifndef DPDK_19_11_6 + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128 && +#endif + (rte_eal_process_type() != RTE_PROC_PRIMARY || + txq->buffer_ring_vec != NULL)) { + ret = sxe_tx_done_cleanup_vec(txq, free_cnt); + } else{ + ret = sxe_tx_done_cleanup_simple(txq, free_cnt); + } +#else ret = sxe_tx_done_cleanup_simple(txq, free_cnt); +#endif } else { ret = sxe_tx_done_cleanup_full(txq, free_cnt); diff --git a/drivers/net/sxe/base/sxe_rx_common.c b/drivers/net/sxe/base/sxe_rx_common.c index aa830c89d7..8baed167a0 100644 --- a/drivers/net/sxe/base/sxe_rx_common.c +++ b/drivers/net/sxe/base/sxe_rx_common.c @@ -23,6 +23,10 @@ #include "sxe_errno.h" #include "sxe_irq.h" #include "sxe_rx_common.h" +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#include "sxe_vec_common.h" +#include "rte_vect.h" +#endif static inline void sxe_rx_resource_prefetch(u16 next_idx, struct sxe_rx_buffer *buf_ring, @@ -34,12 +38,70 @@ static inline void sxe_rx_resource_prefetch(u16 next_idx, rte_sxe_prefetch(&desc_ring[next_idx]); rte_sxe_prefetch(&buf_ring[next_idx]); } + } +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#if defined DPDK_23_11_3 || defined DPDK_24_11_1 +#ifndef DPDK_23_7 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM) +static void sxe_recycle_rx_descriptors_refill_vec(void *rx_queue, u16 nb_mbufs) +{ + struct sxe_rx_queue *rxq = rx_queue; + struct sxe_rx_buffer *rxep; + volatile union sxe_rx_data_desc *rxdp; + u16 rx_id; + u64 paddr; + u64 dma_addr; + u16 i; + + rxdp = rxq->desc_ring + rxq->realloc_start; + rxep = &rxq->buffer_ring[rxq->realloc_start]; + + for (i = 0; i < nb_mbufs; i++) { + paddr = (rxep[i].mbuf)->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr = rte_cpu_to_le_64(paddr); + rxdp[i].read.hdr_addr = 0; + rxdp[i].read.pkt_addr = dma_addr; + } + + rxq->realloc_start += nb_mbufs; + if (rxq->realloc_start >= rxq->ring_depth) + rxq->realloc_start = 0; + + rxq->realloc_num -= nb_mbufs; + + rx_id = (u16)((rxq->realloc_start == 0) ? 
+ (rxq->ring_depth - 1) : (rxq->realloc_start - 1)); + + SXE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, rx_id); +} +#endif +#endif +#endif +#endif + void __rte_cold __sxe_rx_function_set(struct rte_eth_dev *dev, bool rx_batch_alloc_allowed, bool *rx_vec_allowed) { +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + u16 i, is_using_sse; + + if (sxe_rx_vec_condition_check(dev) || +#ifndef DPDK_19_11_6 + !rx_batch_alloc_allowed || + rte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_128 +#else + !rx_batch_alloc_allowed +#endif + ) { + PMD_LOG_DEBUG(INIT, "Port[%d] doesn't meet Vector Rx " + "preconditions", dev->data->port_id); + *rx_vec_allowed = false; + } +#else UNUSED(rx_vec_allowed); +#endif if (dev->data->lro) { if (rx_batch_alloc_allowed) { @@ -52,7 +114,29 @@ void __rte_cold __sxe_rx_function_set(struct rte_eth_dev *dev, dev->rx_pkt_burst = sxe_single_alloc_lro_pkts_recv; } } else if (dev->data->scattered_rx) { +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + if (*rx_vec_allowed) { + PMD_LOG_DEBUG(INIT, "Using Vector Scattered Rx " + "callback (port=%d).", + dev->data->port_id); +#if defined DPDK_23_11_3 || defined DPDK_24_11_1 +#ifndef DPDK_23_7 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM) + dev->recycle_rx_descriptors_refill = sxe_recycle_rx_descriptors_refill_vec; +#endif +#endif +#endif + dev->rx_pkt_burst = sxe_scattered_pkts_vec_recv; + +#endif + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + + } else if (rx_batch_alloc_allowed) { +#else if (rx_batch_alloc_allowed) { +#endif + PMD_LOG_DEBUG(INIT, "Using a Scattered with bulk " "allocation callback (port=%d).", dev->data->port_id); @@ -67,7 +151,24 @@ void __rte_cold __sxe_rx_function_set(struct rte_eth_dev *dev, dev->rx_pkt_burst = sxe_single_alloc_lro_pkts_recv; } - } else if (rx_batch_alloc_allowed) { + } + #if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + else if (*rx_vec_allowed) { + PMD_LOG_DEBUG(INIT, "Vector rx enabled, please make sure RX " + "burst size no less than %d (port=%d).", + SXE_DESCS_PER_LOOP, + dev->data->port_id); +#if defined DPDK_23_11_3 || defined DPDK_24_11_1 +#ifndef DPDK_23_7 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM) + dev->recycle_rx_descriptors_refill = sxe_recycle_rx_descriptors_refill_vec; +#endif +#endif +#endif + dev->rx_pkt_burst = sxe_pkts_vec_recv; + } +#endif + else if (rx_batch_alloc_allowed) { PMD_LOG_DEBUG(INIT, "Rx Burst Bulk Alloc Preconditions are " "satisfied. 
Rx Burst Bulk Alloc function " "will be used on port=%d.", @@ -82,6 +183,19 @@ void __rte_cold __sxe_rx_function_set(struct rte_eth_dev *dev, dev->rx_pkt_burst = sxe_pkts_recv; } + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + is_using_sse = + (dev->rx_pkt_burst == sxe_scattered_pkts_vec_recv || + dev->rx_pkt_burst == sxe_pkts_vec_recv); + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + struct sxe_rx_queue *rxq = dev->data->rx_queues[i]; + + rxq->is_using_sse = is_using_sse; + } +#endif + } #if defined DPDK_20_11_5 || defined DPDK_19_11_6 @@ -127,7 +241,15 @@ s32 __sxe_rx_descriptor_status(void *rx_queue, u16 offset) ret = -EINVAL; goto l_end; } - hold_num = rxq->hold_num; + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#if defined(RTE_ARCH_X86) + if (rxq->is_using_sse) + hold_num = rxq->realloc_num; + else +#endif +#endif + hold_num = rxq->hold_num; if (offset >= rxq->ring_depth - hold_num) { ret = RTE_ETH_RX_DESC_UNAVAIL; goto l_end; @@ -268,6 +390,16 @@ const u32 *__sxe_dev_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of goto l_end; } +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#if defined(RTE_ARCH_X86) + if (dev->rx_pkt_burst == sxe_pkts_vec_recv || + dev->rx_pkt_burst == sxe_scattered_pkts_vec_recv) { + *no_of_elements = RTE_DIM(ptypes_arr); + ptypes = ptypes_arr; + } +#endif +#endif + l_end: return ptypes; } @@ -300,6 +432,15 @@ const u32 *__sxe_dev_supported_ptypes_get(struct rte_eth_dev *dev) goto l_end; } +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#if defined(RTE_ARCH_X86) + if (dev->rx_pkt_burst == sxe_pkts_vec_recv || + dev->rx_pkt_burst == sxe_scattered_pkts_vec_recv) { + ptypes = ptypes_arr; + } +#endif +#endif + l_end: return ptypes; } diff --git a/drivers/net/sxe/meson.build b/drivers/net/sxe/meson.build index 0e89676375..ecf64ea524 100644 --- a/drivers/net/sxe/meson.build +++ b/drivers/net/sxe/meson.build @@ -2,6 +2,9 @@ # Copyright (C), 2022, Linkdata Technology Co., Ltd. 
cflags += ['-DSXE_DPDK'] cflags += ['-DSXE_HOST_DRIVER'] +cflags += ['-DSXE_DPDK_L4_FEATURES'] +cflags += ['-DSXE_DPDK_SRIOV'] +cflags += ['-DSXE_DPDK_SIMD'] #subdir('base') #objs = [base_objs] @@ -32,6 +35,12 @@ sources = files( testpmd_sources = files('sxe_testpmd.c') +if arch_subdir == 'x86' + sources += files('pf/sxe_vec_sse.c') +elif arch_subdir == 'arm' + sources += files('pf/sxe_vec_neon.c') +endif + includes += include_directories('base') includes += include_directories('pf') includes += include_directories('include/sxe/') diff --git a/drivers/net/sxe/pf/sxe.h b/drivers/net/sxe/pf/sxe.h index c7dafd0e75..c9c71a0c90 100644 --- a/drivers/net/sxe/pf/sxe.h +++ b/drivers/net/sxe/pf/sxe.h @@ -66,6 +66,9 @@ struct sxe_adapter { struct sxe_dcb_context dcb_ctxt; bool rx_batch_alloc_allowed; +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + bool rx_vec_allowed; +#endif s8 name[PCI_PRI_STR_SIZE + 1]; u32 mtu; diff --git a/drivers/net/sxe/pf/sxe_ethdev.c b/drivers/net/sxe/pf/sxe_ethdev.c index f3ac4cbfc8..46d7f0dbf7 100644 --- a/drivers/net/sxe/pf/sxe_ethdev.c +++ b/drivers/net/sxe/pf/sxe_ethdev.c @@ -98,6 +98,11 @@ static s32 sxe_dev_configure(struct rte_eth_dev *dev) /* Default use batch alloc */ adapter->rx_batch_alloc_allowed = true; + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD + adapter->rx_vec_allowed = true; +#endif + l_end: return ret; } diff --git a/drivers/net/sxe/pf/sxe_rx.c b/drivers/net/sxe/pf/sxe_rx.c index 232fab0ab1..8504e1ac43 100644 --- a/drivers/net/sxe/pf/sxe_rx.c +++ b/drivers/net/sxe/pf/sxe_rx.c @@ -26,6 +26,9 @@ #include "sxe_errno.h" #include "sxe_irq.h" #include "sxe_ethdev.h" +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#include "sxe_vec_common.h" +#endif #include "sxe_rx_common.h" #define SXE_LRO_HDR_SIZE 128 diff --git a/drivers/net/sxe/pf/sxe_vec_common.h b/drivers/net/sxe/pf/sxe_vec_common.h new file mode 100644 index 0000000000..d3571dbf5b --- /dev/null +++ b/drivers/net/sxe/pf/sxe_vec_common.h @@ -0,0 +1,325 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C), 2022, Linkdata Technology Co., Ltd. 
+ */ +#ifndef __SXE_VEC_COMMON_H__ +#define __SXE_VEC_COMMON_H__ + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#include +#include + +#if defined DPDK_20_11_5 || defined DPDK_19_11_6 +#include +#include +#elif defined DPDK_21_11_5 +#include +#include +#include +#else +#include +#include +#include +#endif +#include "sxe.h" +#include "sxe_rx.h" + +#define RTE_SXE_MAX_TX_FREE_BUF_SZ 64 +#define SXE_TXD_STAT_DD 0x00000001 + +static __rte_always_inline s32 +sxe_tx_bufs_vec_free(struct sxe_tx_queue *txq) +{ + struct sxe_tx_buffer_vec *txep; + u32 status; + u32 n; + u32 i; + s32 ret; + s32 nb_free = 0; + struct rte_mbuf *m, *free[RTE_SXE_MAX_TX_FREE_BUF_SZ]; + + status = txq->desc_ring[txq->next_dd].wb.status; + if (!(status & SXE_TXD_STAT_DD)) { + ret = 0; + goto out; + } + + n = txq->rs_thresh; + + txep = &txq->buffer_ring_vec[txq->next_dd - (n - 1)]; + m = rte_pktmbuf_prefree_seg(txep[0].mbuf); + + if (likely(m != NULL)) { + free[0] = m; + nb_free = 1; + for (i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (likely(m != NULL)) { + if (likely(m->pool == free[0]->pool)) { + free[nb_free++] = m; + } else { + rte_mempool_put_bulk(free[0]->pool, + (void *)free, nb_free); + free[0] = m; + nb_free = 1; + } + } + } + rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free); + } else { + for (i = 1; i < n; i++) { + m = rte_pktmbuf_prefree_seg(txep[i].mbuf); + if (m != NULL) + rte_mempool_put(m->pool, m); + } + } + + txq->desc_free_num = (u16)(txq->desc_free_num + txq->rs_thresh); + txq->next_dd = (u16)(txq->next_dd + txq->rs_thresh); + if (txq->next_dd >= txq->ring_depth) + txq->next_dd = (u16)(txq->rs_thresh - 1); + + ret = txq->rs_thresh; +out: + return ret; +} + +static inline u16 +sxe_packets_reassemble(sxe_rx_queue_s *rxq, struct rte_mbuf **rx_bufs, + u16 bufs_num, u8 *split_flags) +{ + struct rte_mbuf *pkts[bufs_num]; + struct rte_mbuf *start = rxq->pkt_first_seg; + struct rte_mbuf *end = rxq->pkt_last_seg; + u32 pkt_idx, buf_idx; + + for (buf_idx = 0, pkt_idx = 0; buf_idx < bufs_num; buf_idx++) { + if (end != NULL) { + end->next = rx_bufs[buf_idx]; + rx_bufs[buf_idx]->data_len += rxq->crc_len; + + start->nb_segs++; + start->pkt_len += rx_bufs[buf_idx]->data_len; + end = end->next; + + if (!split_flags[buf_idx]) { + start->hash = end->hash; + start->ol_flags = end->ol_flags; + start->pkt_len -= rxq->crc_len; + if (end->data_len > rxq->crc_len) { + end->data_len -= rxq->crc_len; + } else { + struct rte_mbuf *secondlast = start; + + start->nb_segs--; + while (secondlast->next != end) + secondlast = secondlast->next; + + secondlast->data_len -= (rxq->crc_len - + end->data_len); + secondlast->next = NULL; + rte_pktmbuf_free_seg(end); + } + pkts[pkt_idx++] = start; + start = NULL; + end = NULL; + } + } else { + if (!split_flags[buf_idx]) { + pkts[pkt_idx++] = rx_bufs[buf_idx]; + continue; + } + start = rx_bufs[buf_idx]; + end = rx_bufs[buf_idx]; + rx_bufs[buf_idx]->data_len += rxq->crc_len; + rx_bufs[buf_idx]->pkt_len += rxq->crc_len; + } + } + + rxq->pkt_first_seg = start; + rxq->pkt_last_seg = end; + memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts))); + + return pkt_idx; +} + +static inline void +sxe_rx_vec_mbufs_release(sxe_rx_queue_s *rxq) +{ + u16 i; + + if (rxq->buffer_ring == NULL || rxq->realloc_num >= rxq->ring_depth) + return; + + if (rxq->realloc_num == 0) { + for (i = 0; i < rxq->ring_depth; i++) { + if (rxq->buffer_ring[i].mbuf != NULL) + rte_pktmbuf_free_seg(rxq->buffer_ring[i].mbuf); + } + } else { + for (i = rxq->processing_idx; + i != 
rxq->realloc_start; + i = (i + 1) % rxq->ring_depth) { + if (rxq->buffer_ring[i].mbuf != NULL) + rte_pktmbuf_free_seg(rxq->buffer_ring[i].mbuf); + } + } + + rxq->realloc_num = rxq->ring_depth; + + memset(rxq->buffer_ring, 0, sizeof(rxq->buffer_ring[0]) * rxq->ring_depth); +} + +static inline s32 +sxe_default_rxq_vec_setup(sxe_rx_queue_s *rxq) +{ + uintptr_t p; + struct rte_mbuf mbuf = { .buf_addr = 0 }; + + mbuf.nb_segs = 1; + mbuf.data_off = RTE_PKTMBUF_HEADROOM; + mbuf.port = rxq->port_id; + rte_mbuf_refcnt_set(&mbuf, 1); + + rte_compiler_barrier(); + p = (uintptr_t)&mbuf.rearm_data; + rxq->mbuf_init_value = *(u64 *)p; + + return 0; +} + +static inline s32 +sxe_default_rx_vec_condition_check(struct rte_eth_dev *dev) +{ + s32 ret = 0; + +#ifndef RTE_LIBRTE_IEEE1588 + struct rte_eth_fdir_conf *fnav_conf = SXE_DEV_FNAV_CONF(dev); + if (fnav_conf->mode != RTE_FDIR_MODE_NONE) + ret = -1; +#else + RTE_SET_USED(dev); + ret = -1; +#endif + + return ret; +} + +static __rte_always_inline void +sxe_vec_mbuf_fill(struct sxe_tx_buffer_vec *buffer_ring, + struct rte_mbuf **tx_pkts, u16 pkts_num) +{ + s32 i; + + for (i = 0; i < pkts_num; ++i) + buffer_ring[i].mbuf = tx_pkts[i]; +} + +static inline void +sxe_tx_queue_vec_init(sxe_tx_queue_s *txq) +{ + u16 i; + volatile sxe_tx_data_desc_u *txd; + static const sxe_tx_data_desc_u zeroed_desc = { {0} }; + struct sxe_tx_buffer_vec *tx_buffer = txq->buffer_ring_vec; + + for (i = 0; i < txq->ring_depth; i++) + txq->desc_ring[i] = zeroed_desc; + + for (i = 0; i < txq->ring_depth; i++) { + txd = &txq->desc_ring[i]; + txd->wb.status = SXE_TX_DESC_STAT_DD; + tx_buffer[i].mbuf = NULL; + } + + txq->ctx_curr = 0; + txq->desc_used_num = 0; + txq->desc_free_num = txq->ring_depth - 1; + txq->next_to_use = 0; + txq->next_to_clean = txq->ring_depth - 1; + txq->next_dd = txq->rs_thresh - 1; + txq->next_rs = txq->rs_thresh - 1; + memset((void *)&txq->ctx_cache, 0, + SXE_CTXT_DESC_NUM * sizeof(struct sxe_ctxt_info)); +} + +static inline void +sxe_tx_mbufs_vec_release(sxe_tx_queue_s *txq) +{ + u16 i; + struct sxe_tx_buffer_vec *tx_buffer; + const u16 max_desc = (u16)(txq->ring_depth - 1); + + if (txq->buffer_ring_vec == NULL || txq->desc_free_num == max_desc) + return; + + for (i = txq->next_dd - (txq->rs_thresh - 1); + i != txq->next_to_use; + i = (i + 1) % txq->ring_depth) { + tx_buffer = &txq->buffer_ring_vec[i]; + rte_pktmbuf_free_seg(tx_buffer->mbuf); + } + txq->desc_free_num = max_desc; + + for (i = 0; i < txq->ring_depth; i++) { + tx_buffer = &txq->buffer_ring_vec[i]; + tx_buffer->mbuf = NULL; + } +} + +static inline void +sxe_tx_buffer_ring_vec_free(sxe_tx_queue_s *txq) +{ + if (txq == NULL) + return; + + if (txq->buffer_ring_vec != NULL) { + rte_free(txq->buffer_ring_vec - 1); + txq->buffer_ring_vec = NULL; + } +} + +static inline s32 +sxe_default_txq_vec_setup(sxe_tx_queue_s *txq, + const struct sxe_txq_ops *txq_ops) +{ + s32 ret = 0; + + if (txq->buffer_ring_vec == NULL) { + ret = -1; + goto l_out; + } + + txq->buffer_ring_vec = txq->buffer_ring_vec + 1; + txq->ops = txq_ops; + +l_out: + return ret; +} + +static inline int +sxe_tx_done_cleanup_vec(sxe_tx_queue_s *txq, u32 free_cnt) +{ + UNUSED(txq); + UNUSED(free_cnt); + + return -ENOTSUP; +} + +s32 sxe_txq_vec_setup(sxe_tx_queue_s *txq); + +s32 sxe_rx_vec_condition_check(struct rte_eth_dev *dev); + +s32 sxe_rxq_vec_setup(sxe_rx_queue_s *rxq); + +void sxe_rx_queue_vec_mbufs_release(sxe_rx_queue_s *rxq); + +u16 sxe_scattered_pkts_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, u16 pkts_num); + +u16 
sxe_pkts_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, u16 pkts_num); + +u16 +__sxe_pkts_vector_xmit(void *tx_queue, struct rte_mbuf **tx_pkts, + u16 pkts_num); + +#endif +#endif diff --git a/drivers/net/sxe/pf/sxe_vec_neon.c b/drivers/net/sxe/pf/sxe_vec_neon.c new file mode 100644 index 0000000000..8e425e8487 --- /dev/null +++ b/drivers/net/sxe/pf/sxe_vec_neon.c @@ -0,0 +1,760 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C), 2022, Linkdata Technology Co., Ltd. + */ + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#include +#include "sxe_dpdk_version.h" +#if defined DPDK_20_11_5 || defined DPDK_19_11_6 +#include +#else +#include +#endif +#include + +#include +#include "sxe_vec_common.h" + +#define RTE_SXE_DESCS_PER_LOOP 4 +#define SXE_PACKET_TYPE_MASK_TUNNEL 0xFF +#define SXE_PACKET_TYPE_SHIFT 0x04 +#define SXE_RXDADV_ERR_TCPE 0x40000000 +#define SXE_VPMD_DESC_EOP_MASK 0x02020202 +#define SXE_UINT8_BIT (CHAR_BIT * sizeof(u8)) + +#pragma GCC diagnostic ignored "-Wcast-qual" + +static inline void +sxe_rxq_rearm(struct sxe_rx_queue *rxq) +{ + s32 i; + u16 rx_id; + volatile union sxe_rx_data_desc *rxdp; + struct sxe_rx_buffer *rxep = &rxq->buffer_ring[rxq->realloc_start]; + struct rte_mbuf *mb0, *mb1; + uint64x2_t dma_addr0, dma_addr1; + uint64x2_t zero = vdupq_n_u64(0); + u64 paddr; + uint8x8_t p; + + rxdp = rxq->desc_ring + rxq->realloc_start; + + if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, + (void *)rxep, + RTE_PMD_SXE_MAX_RX_BURST) < 0)) { + if (rxq->realloc_num + RTE_PMD_SXE_MAX_RX_BURST >= + rxq->ring_depth) { + for (i = 0; i < RTE_SXE_DESCS_PER_LOOP; i++) { + rxep[i].mbuf = &rxq->fake_mbuf; + vst1q_u64((u64 *)&rxdp[i].read, + zero); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + RTE_PMD_SXE_MAX_RX_BURST; + return; + } + + p = vld1_u8((u8 *)&rxq->mbuf_init_value); + + for (i = 0; i < RTE_PMD_SXE_MAX_RX_BURST; i += 2, rxep += 2) { + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + + vst1_u8((u8 *)&mb0->rearm_data, p); + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr0 = vsetq_lane_u64(paddr, zero, 0); + + vst1q_u64((u64 *)&rxdp++->read, dma_addr0); + + vst1_u8((u8 *)&mb1->rearm_data, p); + paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM; + dma_addr1 = vsetq_lane_u64(paddr, zero, 0); + vst1q_u64((u64 *)&rxdp++->read, dma_addr1); + } + + rxq->realloc_start += RTE_PMD_SXE_MAX_RX_BURST; + if (rxq->realloc_start >= rxq->ring_depth) + rxq->realloc_start = 0; + + rxq->realloc_num -= RTE_PMD_SXE_MAX_RX_BURST; + + rx_id = (u16)((rxq->realloc_start == 0) ? 
+ (rxq->ring_depth - 1) : (rxq->realloc_start - 1)); + + sxe_write_addr(rx_id, rxq->rdt_reg_addr); +} + +#if defined DPDK_22_11_3 || defined DPDK_21_11_5 || defined DPDK_23_11_3 || defined DPDK_24_11_1 + +static inline void +sxe_desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2, + uint8x16_t staterr, u8 vlan_flags, u16 udp_p_flag, + struct rte_mbuf **rx_pkts) +{ + u16 udp_p_flag_hi; + uint8x16_t ptype, udp_csum_skip; + uint32x4_t temp_udp_csum_skip = {0, 0, 0, 0}; + uint8x16_t vtag_lo, vtag_hi, vtag; + uint8x16_t temp_csum; + uint32x4_t csum = {0, 0, 0, 0}; + + union { + u16 e[4]; + u64 word; + } vol; + + const uint8x16_t rsstype_msk = { + 0x0F, 0x0F, 0x0F, 0x0F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + const uint8x16_t rss_flags = { + 0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, + 0, RTE_MBUF_F_RX_RSS_HASH, 0, RTE_MBUF_F_RX_RSS_HASH, + RTE_MBUF_F_RX_RSS_HASH, 0, 0, 0, + 0, 0, 0, RTE_MBUF_F_RX_FDIR}; + + const uint8x16_t vlan_csum_msk = { + SXE_RXD_STAT_VP, SXE_RXD_STAT_VP, + SXE_RXD_STAT_VP, SXE_RXD_STAT_VP, + 0, 0, 0, 0, + 0, 0, 0, 0, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24}; + + const uint8x16_t vlan_csum_map_lo = { + RTE_MBUF_F_RX_IP_CKSUM_GOOD, + RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + 0, 0, 0, 0, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + 0, 0, 0, 0}; + + const uint8x16_t vlan_csum_map_hi = { + RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), 0, + RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), 0, + 0, 0, 0, 0, + RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), 0, + RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), 0, + 0, 0, 0, 0}; + + udp_p_flag_hi = udp_p_flag >> 8; + + const uint8x16_t udp_hdr_p_msk = { + 0, 0, 0, 0, + udp_p_flag_hi, udp_p_flag_hi, udp_p_flag_hi, udp_p_flag_hi, + 0, 0, 0, 0, + 0, 0, 0, 0}; + + const uint8x16_t udp_csum_bad_shuf = { + 0xFF, ~(u8)RTE_MBUF_F_RX_L4_CKSUM_BAD, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0}; + + ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0]; + + udp_csum_skip = vandq_u8(ptype, udp_hdr_p_msk); + + temp_udp_csum_skip = vcopyq_laneq_u32(temp_udp_csum_skip, 0, + vreinterpretq_u32_u8(udp_csum_skip), 1); + + ptype = vandq_u8(ptype, rsstype_msk); + ptype = vqtbl1q_u8(rss_flags, ptype); + + vtag = vandq_u8(staterr, vlan_csum_msk); + + temp_csum = vshrq_n_u8(vtag, 6); + + csum = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u8(temp_csum), 3), csum, 0); + vtag = vorrq_u8(vreinterpretq_u8_u32(csum), vtag); + + vtag_hi = vqtbl1q_u8(vlan_csum_map_hi, vtag); + vtag_hi = vshrq_n_u8(vtag_hi, 7); + + vtag_lo = vqtbl1q_u8(vlan_csum_map_lo, vtag); + vtag_lo = vorrq_u8(ptype, vtag_lo); + + udp_csum_skip = vshrq_n_u8(vreinterpretq_u8_u32(temp_udp_csum_skip), 1); + udp_csum_skip = vqtbl1q_u8(udp_csum_bad_shuf, udp_csum_skip); + vtag_lo = vandq_u8(vtag_lo, udp_csum_skip); + + vtag = vzipq_u8(vtag_lo, vtag_hi).val[0]; + vol.word = vgetq_lane_u64(vreinterpretq_u64_u8(vtag), 0); + + rx_pkts[0]->ol_flags = vol.e[0]; + rx_pkts[1]->ol_flags = vol.e[1]; + rx_pkts[2]->ol_flags = vol.e[2]; + rx_pkts[3]->ol_flags = vol.e[3]; +} 
+ +#elif defined DPDK_20_11_5 + +#define SXE_VTAG_SHIFT (3) + +static inline void +sxe_desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2, + uint8x16_t staterr, struct rte_mbuf **rx_pkts) +{ + uint8x16_t ptype; + uint8x16_t vtag; + + union { + u8 e[4]; + u32 word; + } vol; + + const uint8x16_t pkttype_msk = { + PKT_RX_VLAN, PKT_RX_VLAN, + PKT_RX_VLAN, PKT_RX_VLAN, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + const uint8x16_t rsstype_msk = { + 0x0F, 0x0F, 0x0F, 0x0F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + const uint8x16_t rss_flags = { + 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, + 0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, + PKT_RX_RSS_HASH, 0, 0, 0, + 0, 0, 0, PKT_RX_FDIR}; + + ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0]; + ptype = vandq_u8(ptype, rsstype_msk); + ptype = vqtbl1q_u8(rss_flags, ptype); + + vtag = vshrq_n_u8(staterr, SXE_VTAG_SHIFT); + vtag = vandq_u8(vtag, pkttype_msk); + vtag = vorrq_u8(ptype, vtag); + + vol.word = vgetq_lane_u32(vreinterpretq_u32_u8(vtag), 0); + + rx_pkts[0]->ol_flags = vol.e[0]; + rx_pkts[1]->ol_flags = vol.e[1]; + rx_pkts[2]->ol_flags = vol.e[2]; + rx_pkts[3]->ol_flags = vol.e[3]; +} + +#elif defined DPDK_19_11_6 + +static inline void +sxe_desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2, + uint8x16_t staterr, u8 vlan_flags, struct rte_mbuf **rx_pkts) +{ + uint8x16_t ptype; + uint8x16_t vtag_lo, vtag_hi, vtag; + uint8x16_t temp_csum; + uint32x4_t csum = {0, 0, 0, 0}; + + union { + u16 e[4]; + u64 word; + } vol; + + const uint8x16_t rsstype_msk = { + 0x0F, 0x0F, 0x0F, 0x0F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + const uint8x16_t rss_flags = { + 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, + 0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, + PKT_RX_RSS_HASH, 0, 0, 0, + 0, 0, 0, PKT_RX_FDIR}; + + const uint8x16_t vlan_csum_msk = { + SXE_RXD_STAT_VP, SXE_RXD_STAT_VP, + SXE_RXD_STAT_VP, SXE_RXD_STAT_VP, + 0, 0, 0, 0, + 0, 0, 0, 0, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24, + (SXE_RXDADV_ERR_TCPE | SXE_RXDADV_ERR_IPE) >> 24}; + + const uint8x16_t vlan_csum_map_lo = { + PKT_RX_IP_CKSUM_GOOD, + PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD, + PKT_RX_IP_CKSUM_BAD, + PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, + 0, 0, 0, 0, + vlan_flags | PKT_RX_IP_CKSUM_GOOD, + vlan_flags | PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD, + vlan_flags | PKT_RX_IP_CKSUM_BAD, + vlan_flags | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, + 0, 0, 0, 0}; + + const uint8x16_t vlan_csum_map_hi = { + PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, + PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, + 0, 0, 0, 0, + PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, + PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, + 0, 0, 0, 0}; + + ptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0]; + ptype = vandq_u8(ptype, rsstype_msk); + ptype = vqtbl1q_u8(rss_flags, ptype); + + vtag = vandq_u8(staterr, vlan_csum_msk); + + temp_csum = vshrq_n_u8(vtag, 6); + + csum = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u8(temp_csum), 3), csum, 0); + vtag = vorrq_u8(vreinterpretq_u8_u32(csum), vtag); + + vtag_hi = vqtbl1q_u8(vlan_csum_map_hi, vtag); + vtag_hi = vshrq_n_u8(vtag_hi, 7); + + vtag_lo = vqtbl1q_u8(vlan_csum_map_lo, vtag); + vtag_lo = vorrq_u8(ptype, vtag_lo); + + vtag = vzipq_u8(vtag_lo, vtag_hi).val[0]; + vol.word = 
vgetq_lane_u64(vreinterpretq_u64_u8(vtag), 0); + + rx_pkts[0]->ol_flags = vol.e[0]; + rx_pkts[1]->ol_flags = vol.e[1]; + rx_pkts[2]->ol_flags = vol.e[2]; + rx_pkts[3]->ol_flags = vol.e[3]; +} +#endif + +static inline u32 +sxe_get_packet_type(u32 pkt_info, + u32 etqf_check, + u32 tunnel_check) +{ + u32 rte; + + if (etqf_check) { + rte = RTE_PTYPE_UNKNOWN; + goto out; + } + + if (tunnel_check) { + pkt_info &= SXE_PACKET_TYPE_MASK_TUNNEL; + rte = sxe_ptype_table_tn[pkt_info]; + goto out; + } + + pkt_info &= SXE_PACKET_TYPE_MASK; + rte = sxe_ptype_table[pkt_info]; + +out: + return rte; +} + +static inline void +sxe_desc_to_ptype_v(uint64x2_t descs[4], u16 pkt_type_mask, + struct rte_mbuf **rx_pkts) +{ + uint32x4_t etqf_check, tunnel_check; + uint32x4_t etqf_mask = vdupq_n_u32(0x8000); + uint32x4_t tunnel_mask = vdupq_n_u32(0x10000); + uint32x4_t ptype_mask = vdupq_n_u32((u32)pkt_type_mask); + uint32x4_t ptype0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]), + vreinterpretq_u32_u64(descs[2])).val[0]; + uint32x4_t ptype1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]), + vreinterpretq_u32_u64(descs[3])).val[0]; + + ptype0 = vzipq_u32(ptype0, ptype1).val[0]; + + etqf_check = vandq_u32(ptype0, etqf_mask); + tunnel_check = vandq_u32(ptype0, tunnel_mask); + + ptype0 = vandq_u32(vshrq_n_u32(ptype0, SXE_PACKET_TYPE_SHIFT), + ptype_mask); + + rx_pkts[0]->packet_type = + sxe_get_packet_type(vgetq_lane_u32(ptype0, 0), + vgetq_lane_u32(etqf_check, 0), + vgetq_lane_u32(tunnel_check, 0)); + rx_pkts[1]->packet_type = + sxe_get_packet_type(vgetq_lane_u32(ptype0, 1), + vgetq_lane_u32(etqf_check, 1), + vgetq_lane_u32(tunnel_check, 1)); + rx_pkts[2]->packet_type = + sxe_get_packet_type(vgetq_lane_u32(ptype0, 2), + vgetq_lane_u32(etqf_check, 2), + vgetq_lane_u32(tunnel_check, 2)); + rx_pkts[3]->packet_type = + sxe_get_packet_type(vgetq_lane_u32(ptype0, 3), + vgetq_lane_u32(etqf_check, 3), + vgetq_lane_u32(tunnel_check, 3)); +} + +static inline u16 +sxe_recv_raw_pkts_vec(struct sxe_rx_queue *rxq, struct rte_mbuf **rx_pkts, + u16 nb_pkts, u8 *split_packet) +{ + volatile union sxe_rx_data_desc *rxdp; + struct sxe_rx_buffer *sw_ring; + u16 nb_pkts_recd; + s32 pos; + u16 rte; + uint8x16_t shuf_msk = { + 0xFF, 0xFF, + 0xFF, 0xFF, + 12, 13, + 0xFF, 0xFF, + 12, 13, + 14, 15, + 4, 5, 6, 7 + }; + uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0, + rxq->crc_len, 0, 0, 0}; + + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_SXE_DESCS_PER_LOOP); + + rxdp = rxq->desc_ring + rxq->processing_idx; + + rte_prefetch_non_temporal(rxdp); + + if (rxq->realloc_num > RTE_PMD_SXE_MAX_RX_BURST) + sxe_rxq_rearm(rxq); + + if (!(rxdp->wb.upper.status_error & + rte_cpu_to_le_32(SXE_RXDADV_STAT_DD))) { + rte = 0; + goto out; + } + + sw_ring = &rxq->buffer_ring[rxq->processing_idx]; + + RTE_BUILD_BUG_ON((RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED) > UINT8_MAX); + +#if defined DPDK_22_11_3 || defined DPDK_21_11_5 || defined DPDK_23_11_3 || defined DPDK_24_11_1 + u16 udp_p_flag = SXE_RXDADV_PKTTYPE_UDP; + u8 vlan_flags = rxq->vlan_flags & UINT8_MAX; +#elif defined DPDK_19_11_6 + u8 vlan_flags = rxq->vlan_flags & UINT8_MAX; +#endif + + for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; + pos += RTE_SXE_DESCS_PER_LOOP, + rxdp += RTE_SXE_DESCS_PER_LOOP) { + uint64x2_t descs[RTE_SXE_DESCS_PER_LOOP]; + uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; + uint8x16x2_t sterr_tmp1, sterr_tmp2; + uint64x2_t mbp1, mbp2; + uint8x16_t staterr; + uint16x8_t tmp; + u32 stat; + + mbp1 = vld1q_u64((u64 *)&sw_ring[pos]); + + vst1q_u64((u64 *)&rx_pkts[pos], mbp1); + + mbp2 = 
vld1q_u64((u64 *)&sw_ring[pos + 2]); + + descs[0] = vld1q_u64((u64 *)(rxdp)); + descs[1] = vld1q_u64((u64 *)(rxdp + 1)); + descs[2] = vld1q_u64((u64 *)(rxdp + 2)); + descs[3] = vld1q_u64((u64 *)(rxdp + 3)); + + vst1q_u64((u64 *)&rx_pkts[pos + 2], mbp2); + + if (split_packet) { + rte_mbuf_prefetch_part2(rx_pkts[pos]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); + } + + pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk); + pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk); + + pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk); + pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk); + + sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]), + vreinterpretq_u8_u64(descs[3])); + sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]), + vreinterpretq_u8_u64(descs[2])); + + staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0]; + +#if defined DPDK_22_11_3 || defined DPDK_21_11_5 || defined DPDK_23_11_3 || defined DPDK_24_11_1 + sxe_desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, vlan_flags, + udp_p_flag, &rx_pkts[pos]); +#elif defined DPDK_19_11_6 + sxe_desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, vlan_flags, + &rx_pkts[pos]); +#else + sxe_desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, &rx_pkts[pos]); +#endif + + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust); + pkt_mb4 = vreinterpretq_u8_u16(tmp); + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust); + pkt_mb3 = vreinterpretq_u8_u16(tmp); + + vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, + pkt_mb4); + vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, + pkt_mb3); + + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust); + pkt_mb2 = vreinterpretq_u8_u16(tmp); + tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust); + pkt_mb1 = vreinterpretq_u8_u16(tmp); + + if (split_packet) { + stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0); + *(s32 *)split_packet = ~stat & SXE_VPMD_DESC_EOP_MASK; + + split_packet += RTE_SXE_DESCS_PER_LOOP; + } + + staterr = vshlq_n_u8(staterr, SXE_UINT8_BIT - 1); + staterr = vreinterpretq_u8_s8 + (vshrq_n_s8(vreinterpretq_s8_u8(staterr), + SXE_UINT8_BIT - 1)); + stat = ~vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0); + + rte_prefetch_non_temporal(rxdp + RTE_SXE_DESCS_PER_LOOP); + + vst1q_u8((u8 *)&rx_pkts[pos + 1]->rx_descriptor_fields1, + pkt_mb2); + vst1q_u8((u8 *)&rx_pkts[pos]->rx_descriptor_fields1, + pkt_mb1); + + sxe_desc_to_ptype_v(descs, rxq->pkt_type_mask, &rx_pkts[pos]); + + if (unlikely(stat == 0)) { + nb_pkts_recd += RTE_SXE_DESCS_PER_LOOP; + } else { +#if (defined DPDK_23_11_3 && !defined DPDK_23_7) || defined DPDK_24_11_1 + nb_pkts_recd += rte_ctz32(stat) / SXE_UINT8_BIT; +#else + nb_pkts_recd += __builtin_ctz(stat) / SXE_UINT8_BIT; +#endif + break; + } + } + + rxq->processing_idx = (u16)(rxq->processing_idx + nb_pkts_recd); + rxq->processing_idx = (u16)(rxq->processing_idx & (rxq->ring_depth - 1)); + rxq->realloc_num = (u16)(rxq->realloc_num + nb_pkts_recd); + + rte = nb_pkts_recd; + +out: + return rte; +} + +u16 sxe_pkts_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, u16 nb_pkts) +{ + return sxe_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); +} + +static u16 sxe_recv_scattered_burst_vec(void *rx_queue, + struct rte_mbuf **rx_pkts, u16 nb_pkts) +{ + u32 i = 0; + struct sxe_rx_queue *rxq = rx_queue; + u8 split_flags[RTE_PMD_SXE_MAX_RX_BURST] = {0}; + + u16 nb_bufs = 
sxe_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts, + split_flags); + if (nb_bufs == 0) + goto l_out; + + const u64 *split_fl64 = (u64 *)split_flags; + if (rxq->pkt_first_seg == NULL && + split_fl64[0] == 0 && split_fl64[1] == 0 && + split_fl64[2] == 0 && split_fl64[3] == 0) + goto l_out; + + if (rxq->pkt_first_seg == NULL) { + while (i < nb_bufs && !split_flags[i]) + i++; + if (i == nb_bufs) + goto l_out; + rxq->pkt_first_seg = rx_pkts[i]; + } + + nb_bufs = i + sxe_packets_reassemble(rxq, &rx_pkts[i], nb_bufs - i, + &split_flags[i]); + +l_out: + return nb_bufs; +} + +u16 +sxe_scattered_pkts_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, + u16 nb_pkts) +{ + u16 retval = 0; + + while (nb_pkts > RTE_PMD_SXE_MAX_RX_BURST) { + u16 burst; + + burst = sxe_recv_scattered_burst_vec(rx_queue, + rx_pkts + retval, + RTE_PMD_SXE_MAX_RX_BURST); + retval += burst; + nb_pkts -= burst; + if (burst < RTE_PMD_SXE_MAX_RX_BURST) + goto l_out; + } + + retval += sxe_recv_scattered_burst_vec(rx_queue, + rx_pkts + retval, + nb_pkts); +l_out: + return retval; +} + +static inline void +sxe_single_vec_desc_fill(volatile union sxe_tx_data_desc *txdp, + struct rte_mbuf *pkt, u64 flags) +{ + uint64x2_t descriptor = { + pkt->buf_iova + pkt->data_off, + (u64)pkt->pkt_len << 46 | flags | pkt->data_len}; + + vst1q_u64((u64 *)&txdp->read, descriptor); +} + +static inline void +sxe_vec_desc_fill(volatile union sxe_tx_data_desc *txdp, + struct rte_mbuf **pkt, u16 nb_pkts, u64 flags) +{ + s32 i; + + for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt) + sxe_single_vec_desc_fill(txdp, *pkt, flags); +} + +u16 __sxe_pkts_vector_xmit(void *tx_queue, struct rte_mbuf **tx_pkts, + u16 nb_pkts) +{ + struct sxe_tx_queue *txq = (struct sxe_tx_queue *)tx_queue; + volatile union sxe_tx_data_desc *txdp; + struct sxe_tx_buffer_vec *txep; + u16 n, nb_commit, tx_id; + u64 flags = SXE_TX_DESC_FLAGS; + u64 rs = SXE_TX_DESC_RS_MASK | SXE_TX_DESC_FLAGS; + s32 i; + + nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh); + + if (txq->desc_free_num < txq->free_thresh) + sxe_tx_bufs_vec_free(txq); + + nb_pkts = (u16)RTE_MIN(txq->desc_free_num, nb_pkts); + nb_commit = nb_pkts; + if (unlikely(nb_pkts == 0)) + goto l_out; + + tx_id = txq->next_to_use; + txdp = &txq->desc_ring[tx_id]; + txep = &txq->buffer_ring_vec[tx_id]; + + txq->desc_free_num = (u16)(txq->desc_free_num - nb_pkts); + + n = (u16)(txq->ring_depth - tx_id); + if (nb_commit >= n) { + sxe_vec_mbuf_fill(txep, tx_pkts, n); + + for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp) + sxe_single_vec_desc_fill(txdp, *tx_pkts, flags); + + sxe_single_vec_desc_fill(txdp, *tx_pkts++, rs); + + nb_commit = (u16)(nb_commit - n); + + tx_id = 0; + txq->next_rs = (u16)(txq->rs_thresh - 1); + + txdp = &txq->desc_ring[tx_id]; + txep = &txq->buffer_ring_vec[tx_id]; + } + + sxe_vec_mbuf_fill(txep, tx_pkts, nb_commit); + sxe_vec_desc_fill(txdp, tx_pkts, nb_commit, flags); + + tx_id = (u16)(tx_id + nb_commit); + if (tx_id > txq->next_rs) { + txq->desc_ring[txq->next_rs].read.cmd_type_len |= + rte_cpu_to_le_32(SXE_TX_DESC_RS_MASK); + txq->next_rs = (u16)(txq->next_rs + + txq->rs_thresh); + } + + txq->next_to_use = tx_id; + + sxe_write_addr(txq->next_to_use, txq->tdt_reg_addr); + +l_out: + return nb_pkts; +} + +static void __rte_cold +sxe_tx_queue_release_mbufs_vec(struct sxe_tx_queue *txq) +{ + sxe_tx_mbufs_vec_release(txq); +} + +void __rte_cold +sxe_rx_queue_vec_mbufs_release(struct sxe_rx_queue *rxq) +{ + sxe_rx_vec_mbufs_release(rxq); +} + +static void __rte_cold +sxe_tx_free_swring(struct sxe_tx_queue *txq) +{ + 
sxe_tx_buffer_ring_vec_free(txq); +} + +static void __rte_cold +sxe_reset_tx_queue(struct sxe_tx_queue *txq) +{ + sxe_tx_queue_vec_init(txq); +} + +static const struct sxe_txq_ops vec_txq_ops = { + .init = sxe_reset_tx_queue, + .mbufs_release = sxe_tx_queue_release_mbufs_vec, + .buffer_ring_free = sxe_tx_free_swring, +}; + +s32 __rte_cold +sxe_rxq_vec_setup(struct sxe_rx_queue *rxq) +{ + return sxe_default_rxq_vec_setup(rxq); +} + +s32 __rte_cold +sxe_txq_vec_setup(struct sxe_tx_queue *txq) +{ + return sxe_default_txq_vec_setup(txq, &vec_txq_ops); +} + +s32 __rte_cold +sxe_rx_vec_condition_check(struct rte_eth_dev *dev) +{ + struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode; + + if (rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM) + return -1; + + return sxe_default_rx_vec_condition_check(dev); +} + +#endif diff --git a/drivers/net/sxe/pf/sxe_vec_sse.c b/drivers/net/sxe/pf/sxe_vec_sse.c new file mode 100644 index 0000000000..70b74ba945 --- /dev/null +++ b/drivers/net/sxe/pf/sxe_vec_sse.c @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C), 2022, Linkdata Technology Co., Ltd. + */ + +#if defined SXE_DPDK_L4_FEATURES && defined SXE_DPDK_SIMD +#include +#include "sxe_dpdk_version.h" +#if defined DPDK_20_11_5 || defined DPDK_19_11_6 +#include +#else +#include +#endif +#include +#ifdef DPDK_24_11_1 +#include +#else +#include +#endif + +#include "sxe_vec_common.h" +#include "sxe_compat_version.h" + +#pragma GCC diagnostic ignored "-Wcast-qual" + +#define SXE_MAX_TX_FREE_BUF_SZ 64 + +static inline void +sxe_rxq_realloc(sxe_rx_queue_s *rx_queue) +{ + s32 i; + u16 rx_index; + volatile union sxe_rx_data_desc *desc_ring; + sxe_rx_buffer_s *buf_ring = + &rx_queue->buffer_ring[rx_queue->realloc_start]; + struct rte_mbuf *mbuf_0, *mbuf_1; + __m128i head_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, + RTE_PKTMBUF_HEADROOM); + __m128i dma_addr0, dma_addr1; + + const __m128i addr_mask = _mm_set_epi64x(0, UINT64_MAX); + + desc_ring = rx_queue->desc_ring + rx_queue->realloc_start; + + if (rte_mempool_get_bulk(rx_queue->mb_pool, + (void *)buf_ring, + RTE_PMD_SXE_MAX_RX_BURST) < 0) { + if (rx_queue->realloc_num + RTE_PMD_SXE_MAX_RX_BURST >= + rx_queue->ring_depth) { + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < SXE_DESCS_PER_LOOP; i++) { + buf_ring[i].mbuf = &rx_queue->fake_mbuf; + _mm_store_si128((__m128i *)&desc_ring[i].read, + dma_addr0); + } + } + rte_eth_devices[rx_queue->port_id].data->rx_mbuf_alloc_failed += + RTE_PMD_SXE_MAX_RX_BURST; + return; + } + + for (i = 0; i < RTE_PMD_SXE_MAX_RX_BURST; i += 2, buf_ring += 2) { + __m128i vaddr0, vaddr1; + + mbuf_0 = buf_ring[0].mbuf; + mbuf_1 = buf_ring[1].mbuf; + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + + vaddr0 = _mm_loadu_si128((__m128i *)&mbuf_0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mbuf_1->buf_addr); + + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); + + dma_addr0 = _mm_add_epi64(dma_addr0, head_room); + dma_addr1 = _mm_add_epi64(dma_addr1, head_room); + + dma_addr0 = _mm_and_si128(dma_addr0, addr_mask); + dma_addr1 = _mm_and_si128(dma_addr1, addr_mask); + + _mm_store_si128((__m128i *)&desc_ring++->read, dma_addr0); + _mm_store_si128((__m128i *)&desc_ring++->read, dma_addr1); + } + + rx_queue->realloc_start += RTE_PMD_SXE_MAX_RX_BURST; + if (rx_queue->realloc_start >= rx_queue->ring_depth) + rx_queue->realloc_start = 0; + + rx_queue->realloc_num -= RTE_PMD_SXE_MAX_RX_BURST; + + rx_index = 
(u16)((rx_queue->realloc_start == 0) ? + (rx_queue->ring_depth - 1) : (rx_queue->realloc_start - 1)); + + SXE_PCI_REG_WC_WRITE_RELAXED(rx_queue->rdt_reg_addr, rx_index); +} + +static inline void +sxe_desc_to_olflags(__m128i descs[4], __m128i mbuf_init, u8 vlan_flags, + u16 udp_p_flag, struct rte_mbuf **rx_pkts) +{ + __m128i ptype0, ptype1, vtype0, vtype1, csum, udp_csum_skip; + __m128i rearm0, rearm1, rearm2, rearm3; + + const __m128i rsstype_mask = _mm_set_epi16 + (0x0000, 0x0000, 0x0000, 0x0000, + 0x000F, 0x000F, 0x000F, 0x000F); + + const __m128i ol_flags_mask = _mm_set_epi16 + (0x0000, 0x0000, 0x0000, 0x0000, + 0x00FF, 0x00FF, 0x00FF, 0x00FF); + + const __m128i rss_flags = _mm_set_epi8(RTE_MBUF_F_RX_FDIR, 0, 0, 0, + 0, 0, 0, RTE_MBUF_F_RX_RSS_HASH, + RTE_MBUF_F_RX_RSS_HASH, 0, RTE_MBUF_F_RX_RSS_HASH, 0, + RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, 0); + + const __m128i vlan_csum_mask = _mm_set_epi16 + ((SXE_RXDADV_ERR_L4E | SXE_RXDADV_ERR_IPE) >> 16, + (SXE_RXDADV_ERR_L4E | SXE_RXDADV_ERR_IPE) >> 16, + (SXE_RXDADV_ERR_L4E | SXE_RXDADV_ERR_IPE) >> 16, + (SXE_RXDADV_ERR_L4E | SXE_RXDADV_ERR_IPE) >> 16, + SXE_RXD_STAT_VP, SXE_RXD_STAT_VP, + SXE_RXD_STAT_VP, SXE_RXD_STAT_VP); + + const __m128i vlan_csum_map_low = _mm_set_epi8 + (0, 0, 0, 0, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + vlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD, + 0, 0, 0, 0, + RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD, + RTE_MBUF_F_RX_IP_CKSUM_GOOD); + + const __m128i vlan_csum_map_high = _mm_set_epi8 + (0, 0, 0, 0, + 0, RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), 0, + RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), + 0, 0, 0, 0, + 0, RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8), 0, + RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(u8)); + + const __m128i udp_hdr_p_msk = _mm_set_epi16 + (0, 0, 0, 0, + udp_p_flag, udp_p_flag, udp_p_flag, udp_p_flag); + + const __m128i udp_csum_bad_shuf = _mm_set_epi8 + (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ~(u8)RTE_MBUF_F_RX_L4_CKSUM_BAD, 0xFF); + + ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]); + ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]); + + vtype0 = _mm_unpackhi_epi16(descs[0], descs[1]); + vtype1 = _mm_unpackhi_epi16(descs[2], descs[3]); + + ptype0 = _mm_unpacklo_epi32(ptype0, ptype1); + + udp_csum_skip = _mm_and_si128(ptype0, udp_hdr_p_msk); + + ptype0 = _mm_and_si128(ptype0, rsstype_mask); + + ptype0 = _mm_shuffle_epi8(rss_flags, ptype0); + + vtype1 = _mm_unpacklo_epi32(vtype0, vtype1); + vtype1 = _mm_and_si128(vtype1, vlan_csum_mask); + + csum = _mm_srli_epi16(vtype1, 14); + + csum = _mm_srli_si128(csum, 8); + vtype1 = _mm_or_si128(csum, vtype1); + + vtype0 = _mm_shuffle_epi8(vlan_csum_map_high, vtype1); + vtype0 = _mm_slli_epi16(vtype0, sizeof(u8)); + + vtype1 = _mm_shuffle_epi8(vlan_csum_map_low, vtype1); + vtype1 = _mm_and_si128(vtype1, ol_flags_mask); + vtype1 = _mm_or_si128(vtype0, vtype1); + + vtype1 = _mm_or_si128(ptype0, vtype1); + + udp_csum_skip = _mm_srli_epi16(udp_csum_skip, 9); + udp_csum_skip = _mm_shuffle_epi8(udp_csum_bad_shuf, udp_csum_skip); + vtype1 = _mm_and_si128(vtype1, udp_csum_skip); + + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtype1, 8), 0x10); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtype1, 6), 0x10); + rearm2 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtype1, 4), 
0x10); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtype1, 2), 0x10); + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != + offsetof(struct rte_mbuf, rearm_data) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + + _mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0); + _mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1); + _mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2); + _mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3); +} + +static inline u32 sxe_packet_type_get(int index, + u32 pkt_info, + u32 etqf_check) +{ + if (etqf_check & (0x02 << (index * SXE_DESCS_PER_LOOP))) + return RTE_PTYPE_UNKNOWN; + + pkt_info &= SXE_PACKET_TYPE_MASK; + return sxe_ptype_table[pkt_info]; +} + +static inline void +sxe_desc_to_ptype_vec(__m128i descs[4], u16 pkt_type_mask, + struct rte_mbuf **rx_pkts) +{ + __m128i etqf_mask = _mm_set_epi64x(0x800000008000LL, 0x800000008000LL); + __m128i ptype_mask = _mm_set_epi32(pkt_type_mask, + pkt_type_mask, pkt_type_mask, pkt_type_mask); + + u32 etqf_check, pkt_info; + + __m128i ptype0 = _mm_unpacklo_epi32(descs[0], descs[2]); + __m128i ptype1 = _mm_unpacklo_epi32(descs[1], descs[3]); + + ptype0 = _mm_unpacklo_epi32(ptype0, ptype1); + + etqf_check = _mm_movemask_epi8(_mm_and_si128(ptype0, etqf_mask)); + + ptype0 = _mm_and_si128(_mm_srli_epi32(ptype0, SXE_RXDADV_PKTTYPE_ETQF_SHIFT), + ptype_mask); + + + pkt_info = _mm_extract_epi32(ptype0, 0); + rx_pkts[0]->packet_type = + sxe_packet_type_get(0, pkt_info, etqf_check); + pkt_info = _mm_extract_epi32(ptype0, 1); + rx_pkts[1]->packet_type = + sxe_packet_type_get(1, pkt_info, etqf_check); + pkt_info = _mm_extract_epi32(ptype0, 2); + rx_pkts[2]->packet_type = + sxe_packet_type_get(2, pkt_info, etqf_check); + pkt_info = _mm_extract_epi32(ptype0, 3); + rx_pkts[3]->packet_type = + sxe_packet_type_get(3, pkt_info, etqf_check); +} + +static inline u16 +sxe_raw_pkts_vec_recv(sxe_rx_queue_s *rx_queue, struct rte_mbuf **rx_pkts, + u16 pkts_num, u8 *split_packet) +{ + volatile union sxe_rx_data_desc *desc_ring; + sxe_rx_buffer_s *buffer_ring; + u16 pkts_recd_num; + s32 pos; + u64 var; + __m128i shuf_msk; + __m128i crc_adjust = _mm_set_epi16 + (0, 0, 0, + -rx_queue->crc_len, + 0, + -rx_queue->crc_len, + 0, 0 + ); + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + __m128i dd_check, eop_check; + __m128i mbuf_init; + u8 vlan_flags; + u16 udp_p_flag = 0; + + pkts_num = RTE_MIN(pkts_num, RTE_PMD_SXE_MAX_RX_BURST); + + pkts_num = RTE_ALIGN_FLOOR(pkts_num, SXE_DESCS_PER_LOOP); + + desc_ring = rx_queue->desc_ring + rx_queue->processing_idx; + + rte_prefetch0(desc_ring); + + if (rx_queue->realloc_num > RTE_PMD_SXE_MAX_RX_BURST) + sxe_rxq_realloc(rx_queue); + + if (!(desc_ring->wb.upper.status_error & + rte_cpu_to_le_32(SXE_RXDADV_STAT_DD))) { + pkts_recd_num = 0; + goto l_out; + } + + udp_p_flag = SXE_RXDADV_PKTTYPE_UDP; + + dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL); + + eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); + + shuf_msk = _mm_set_epi8 + (7, 6, 5, 4, + 15, 14, + 13, 12, + 0xFF, 0xFF, + 13, 12, + 0xFF, 0xFF, + 0xFF, 0xFF + ); + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct 
rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); + + mbuf_init = _mm_set_epi64x(0, rx_queue->mbuf_init_value); + + buffer_ring = &rx_queue->buffer_ring[rx_queue->processing_idx]; + + RTE_BUILD_BUG_ON((RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED) > UINT8_MAX); + vlan_flags = rx_queue->vlan_flags & UINT8_MAX; + + for (pos = 0, pkts_recd_num = 0; pos < pkts_num; + pos += SXE_DESCS_PER_LOOP, + desc_ring += SXE_DESCS_PER_LOOP) { + __m128i descs[SXE_DESCS_PER_LOOP]; + __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; + __m128i zero, staterr, state_err1, state_err2; + __m128i mbp1; +#if defined(RTE_ARCH_X86_64) + __m128i mbp2; +#endif + + mbp1 = _mm_loadu_si128((__m128i *)&buffer_ring[pos]); + + descs[3] = _mm_loadu_si128((__m128i *)(desc_ring + 3)); + rte_compiler_barrier(); + + _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1); + +#if defined(RTE_ARCH_X86_64) + mbp2 = _mm_loadu_si128((__m128i *)&buffer_ring[pos + 2]); +#endif + + descs[2] = _mm_loadu_si128((__m128i *)(desc_ring + 2)); + rte_compiler_barrier(); + descs[1] = _mm_loadu_si128((__m128i *)(desc_ring + 1)); + rte_compiler_barrier(); + descs[0] = _mm_loadu_si128((__m128i *)(desc_ring)); + +#if defined(RTE_ARCH_X86_64) + _mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2); +#endif + + if (split_packet) { + rte_mbuf_prefetch_part2(rx_pkts[pos]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); + rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); + } + + rte_compiler_barrier(); + + pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk); + pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk); + pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk); + pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk); + + state_err2 = _mm_unpackhi_epi32(descs[3], descs[2]); + state_err1 = _mm_unpackhi_epi32(descs[1], descs[0]); + + sxe_desc_to_olflags(descs, mbuf_init, vlan_flags, udp_p_flag, + &rx_pkts[pos]); + + pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust); + pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust); + + zero = _mm_xor_si128(dd_check, dd_check); + + staterr = _mm_unpacklo_epi32(state_err1, state_err2); + + _mm_storeu_si128((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, + pkt_mb4); + _mm_storeu_si128((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, + pkt_mb3); + + pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); + pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust); + + if (split_packet) { + __m128i eop_shuf_mask = _mm_set_epi8 + (0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, + 0x04, 0x0C, 0x00, 0x08 + ); + + __m128i eop_bits = _mm_andnot_si128(staterr, eop_check); + eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask); + *(int *)split_packet = _mm_cvtsi128_si32(eop_bits); + split_packet += SXE_DESCS_PER_LOOP; + } + + staterr = _mm_and_si128(staterr, dd_check); + + staterr = _mm_packs_epi32(staterr, zero); + + _mm_storeu_si128((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, + pkt_mb2); + _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, + pkt_mb1); + + sxe_desc_to_ptype_vec(descs, rx_queue->pkt_type_mask, &rx_pkts[pos]); + +#if (defined DPDK_23_11_3 && !defined DPDK_23_7) || defined DPDK_24_11_1 + var = rte_popcount64(_mm_cvtsi128_si64(staterr)); +#else + var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); +#endif + pkts_recd_num += var; + if 
(likely(var != SXE_DESCS_PER_LOOP)) + break; + } + + rx_queue->processing_idx = (u16)(rx_queue->processing_idx + pkts_recd_num); + rx_queue->processing_idx = (u16)(rx_queue->processing_idx & (rx_queue->ring_depth - 1)); + rx_queue->realloc_num = (u16)(rx_queue->realloc_num + pkts_recd_num); + +l_out: + return pkts_recd_num; +} + +u16 +sxe_pkts_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, u16 pkts_num) +{ + return sxe_raw_pkts_vec_recv(rx_queue, rx_pkts, pkts_num, NULL); +} + +static u16 +sxe_scattered_burst_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, + u16 pkts_num) +{ + u16 i = 0; + u16 bufs_num; + sxe_rx_queue_s *rxq = rx_queue; + u8 split_flags[RTE_PMD_SXE_MAX_RX_BURST] = {0}; + + bufs_num = sxe_raw_pkts_vec_recv(rxq, rx_pkts, pkts_num, + split_flags); + if (bufs_num == 0) + goto l_out; + + const u64 *split_flag_64 = (u64 *)split_flags; + if (rxq->pkt_first_seg == NULL && + split_flag_64[0] == 0 && split_flag_64[1] == 0 && + split_flag_64[2] == 0 && split_flag_64[3] == 0) + goto l_out; + + if (rxq->pkt_first_seg == NULL) { + while (i < bufs_num && !split_flags[i]) + i++; + if (i == bufs_num) + goto l_out; + rxq->pkt_first_seg = rx_pkts[i]; + } + + bufs_num = i + sxe_packets_reassemble(rxq, &rx_pkts[i], bufs_num - i, + &split_flags[i]); + +l_out: + return bufs_num; +} + +u16 +sxe_scattered_pkts_vec_recv(void *rx_queue, struct rte_mbuf **rx_pkts, + u16 pkts_num) +{ + u16 ret = 0; + + while (pkts_num > RTE_PMD_SXE_MAX_RX_BURST) { + u16 burst; + + burst = sxe_scattered_burst_vec_recv(rx_queue, + rx_pkts + ret, + RTE_PMD_SXE_MAX_RX_BURST); + ret += burst; + pkts_num -= burst; + if (burst < RTE_PMD_SXE_MAX_RX_BURST) + goto l_out; + } + + ret += sxe_scattered_burst_vec_recv(rx_queue, + rx_pkts + ret, + pkts_num); +l_out: + return ret; +} + +void __rte_cold +sxe_rx_queue_vec_mbufs_release(sxe_rx_queue_s *rx_queue) +{ + sxe_rx_vec_mbufs_release(rx_queue); +} + +s32 __rte_cold +sxe_rxq_vec_setup(sxe_rx_queue_s *rx_queue) +{ + return sxe_default_rxq_vec_setup(rx_queue); +} + +s32 __rte_cold +sxe_rx_vec_condition_check(struct rte_eth_dev *dev) +{ + return sxe_default_rx_vec_condition_check(dev); +} + +static inline void +sxe_single_vec_desc_fill(volatile sxe_tx_data_desc_u *desc_ring, + struct rte_mbuf *pkts, u64 flags) +{ + __m128i descriptor = _mm_set_epi64x((u64)pkts->pkt_len << 46 | + flags | pkts->data_len, + pkts->buf_iova + pkts->data_off); + _mm_store_si128((__m128i *)&desc_ring->read, descriptor); +} + +static inline void +sxe_vec_desc_fill(volatile sxe_tx_data_desc_u *desc_ring, + struct rte_mbuf **pkts, u16 pkts_num, u64 flags) +{ + s32 i; + + for (i = 0; i < pkts_num; ++i, ++desc_ring, ++pkts) + sxe_single_vec_desc_fill(desc_ring, *pkts, flags); +} + +u16 +__sxe_pkts_vector_xmit(void *tx_queue, struct rte_mbuf **tx_pkts, + u16 pkts_num) +{ + sxe_tx_queue_s *txq = (sxe_tx_queue_s *)tx_queue; + volatile sxe_tx_data_desc_u *desc_ring; + struct sxe_tx_buffer_vec *buffer_ring; + u16 n, commit_num, ntu, xmit_pkts_num; + u64 flags = SXE_TX_DESC_FLAGS; + u64 rs_flags = SXE_TX_DESC_RS_MASK | SXE_TX_DESC_FLAGS; + s32 i; + + if (txq->desc_free_num < txq->free_thresh) + sxe_tx_bufs_vec_free(txq); + + xmit_pkts_num = RTE_MIN(pkts_num, txq->rs_thresh); + xmit_pkts_num = (u16)RTE_MIN(txq->desc_free_num, xmit_pkts_num); + + commit_num = xmit_pkts_num; + if (unlikely(commit_num == 0)) + goto l_out; + + ntu = txq->next_to_use; + desc_ring = &txq->desc_ring[ntu]; + buffer_ring = &txq->buffer_ring_vec[ntu]; + + txq->desc_free_num = (u16)(txq->desc_free_num - xmit_pkts_num); + + n = 
(u16)(txq->ring_depth - ntu); + if (commit_num >= n) { + sxe_vec_mbuf_fill(buffer_ring, tx_pkts, n); + + for (i = 0; i < n - 1; ++i, ++tx_pkts, ++desc_ring) + sxe_single_vec_desc_fill(desc_ring, *tx_pkts, flags); + + sxe_single_vec_desc_fill(desc_ring, *tx_pkts++, rs_flags); + + commit_num = (u16)(commit_num - n); + + ntu = 0; + txq->next_rs = (u16)(txq->rs_thresh - 1); + + desc_ring = &txq->desc_ring[ntu]; + buffer_ring = &txq->buffer_ring_vec[ntu]; + } + + sxe_vec_mbuf_fill(buffer_ring, tx_pkts, commit_num); + + sxe_vec_desc_fill(desc_ring, tx_pkts, commit_num, flags); + + ntu = (u16)(ntu + commit_num); + if (ntu > txq->next_rs) { + txq->desc_ring[txq->next_rs].read.cmd_type_len |= + rte_cpu_to_le_32(SXE_TX_DESC_RS_MASK); + txq->next_rs = (u16)(txq->next_rs + + txq->rs_thresh); + } + + txq->next_to_use = ntu; + rte_wmb(); + rte_write32_wc_relaxed((rte_cpu_to_le_32(txq->next_to_use)), + txq->tdt_reg_addr); + +l_out: + return xmit_pkts_num; +} + +static void __rte_cold +sxe_tx_queue_init(sxe_tx_queue_s *tx_queue) +{ + sxe_tx_queue_vec_init(tx_queue); +} + +static void __rte_cold +sxe_tx_queue_mbufs_release(sxe_tx_queue_s *tx_queue) +{ + sxe_tx_mbufs_vec_release(tx_queue); +} + +static void __rte_cold +sxe_tx_buffer_ring_free(sxe_tx_queue_s *tx_queue) +{ + sxe_tx_buffer_ring_vec_free(tx_queue); +} + +static const struct sxe_txq_ops txq_vec_ops = { + .init = sxe_tx_queue_init, + .mbufs_release = sxe_tx_queue_mbufs_release, + .buffer_ring_free = sxe_tx_buffer_ring_free, +}; + +s32 __rte_cold +sxe_txq_vec_setup(sxe_tx_queue_s *tx_queue) +{ + return sxe_default_txq_vec_setup(tx_queue, &txq_vec_ops); +} + +#endif -- 2.18.4