From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id 97410A052A; Sun, 19 Jul 2020 06:11:48 +0200 (CEST)
Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id CBA3F1C001; Sun, 19 Jul 2020 06:11:47 +0200 (CEST)
Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129]) by dpdk.org (Postfix) with ESMTP id 0A0021BF5E for ; Sun, 19 Jul 2020 06:11:45 +0200 (CEST)
Received: from Internal Mail-Server by MTLPINE1 (envelope-from akozyrev@mellanox.com) with SMTP; 19 Jul 2020 07:11:44 +0300
Received: from pegasus02.mtr.labs.mlnx. (pegasus02.mtr.labs.mlnx [10.210.16.122]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id 06J4BisG003593; Sun, 19 Jul 2020 07:11:44 +0300
From: Alexander Kozyrev
To: dev@dpdk.org
Cc: rasland@mellanox.com, viacheslavo@mellanox.com
Date: Sun, 19 Jul 2020 04:11:42 +0000
Message-Id: <20200719041142.14485-1-akozyrev@mellanox.com>
X-Mailer: git-send-email 2.24.1
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: [dpdk-dev] [PATCH] net/mlx5: implement vectorized MPRQ burst
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: DPDK patches and discussions
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
Errors-To: dev-bounces@dpdk.org
Sender: "dev"

MPRQ (Multi-Packet Rx Queue) processes one packet at a time using simple scalar instructions. MPRQ works by posting a single large buffer (consisting of multiple fixed-size strides) in order to receive multiple packets at once on this buffer. An Rx packet is then either copied to a user-provided mbuf, or the PMD attaches the Rx packet to the mbuf by a pointer to the external buffer.

There is an opportunity to speed up packet reception by processing 4 packets simultaneously using SIMD (single instruction, multiple data) extensions: allocate mbufs in batches for every MPRQ buffer, process the packets in groups of 4 until all the strides are exhausted, then switch to the next MPRQ buffer and repeat.

The vectorized MPRQ burst routine is engaged automatically when the mprq_en=1 devarg is specified and vectorization is not disabled explicitly with the rx_vec_en=0 devarg. There are two limitations:
- LRO is not supported and the scalar MPRQ burst is selected if it is on.
- CQE compression is disabled when the vectorized MPRQ burst is engaged.

Signed-off-by: Alexander Kozyrev
---
 drivers/net/mlx5/linux/mlx5_os.c | 4 +
 drivers/net/mlx5/mlx5_ethdev.c | 12 +-
 drivers/net/mlx5/mlx5_rxq.c | 80 +--
 drivers/net/mlx5/mlx5_rxtx.c | 30 +-
 drivers/net/mlx5/mlx5_rxtx.h | 9 +-
 drivers/net/mlx5/mlx5_rxtx_vec.c | 38 +-
 drivers/net/mlx5/mlx5_rxtx_vec.h | 21 +
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 724 +++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 577 ++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 520 ++++++++++++++++
 10 files changed, 1968 insertions(+), 47 deletions(-)
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c index 742e2fba49..927fa07270 100644 --- a/drivers/net/mlx5/linux/mlx5_os.c +++ b/drivers/net/mlx5/linux/mlx5_os.c @@ -568,6 +568,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, cqe_comp = 0; else cqe_comp = 1; + if (config.mprq.enabled) + cqe_comp = 0; config.cqe_comp = cqe_comp; #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD /* Whether device supports 128B Rx CQE padding. 
*/ @@ -973,6 +975,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, " setting default value (%u)", 1 << config.mprq.stride_num_n); } + if (config.mprq.stride_size_n) + config.rx_vec_en = false; if (config.mprq.stride_size_n && (config.mprq.stride_size_n > mprq_max_stride_size_n || config.mprq.stride_size_n < mprq_min_stride_size_n)) { diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index cefb45064e..f48e8ea293 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) if (dev->rx_pkt_burst == mlx5_rx_burst || dev->rx_pkt_burst == mlx5_rx_burst_mprq || - dev->rx_pkt_burst == mlx5_rx_burst_vec) + dev->rx_pkt_burst == mlx5_rx_burst_vec || + dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec) return ptypes; return NULL; } @@ -479,12 +480,19 @@ mlx5_select_rx_function(struct rte_eth_dev *dev) eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst; MLX5_ASSERT(dev != NULL); - if (mlx5_check_vec_rx_support(dev) > 0) { + if (mlx5_check_vec_rx_support(dev) > 0 && + mlx5_mprq_enabled(dev)) { + rx_pkt_burst = mlx5_rx_burst_mprq_vec; + DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx vectorized function", + dev->data->port_id); + } else if (mlx5_check_vec_rx_support(dev) > 0) { rx_pkt_burst = mlx5_rx_burst_vec; DRV_LOG(DEBUG, "port %u selected Rx vectorized function", dev->data->port_id); } else if (mlx5_mprq_enabled(dev)) { rx_pkt_burst = mlx5_rx_burst_mprq; + DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx function", + dev->data->port_id); } return rx_pkt_burst; } diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 67d996cabf..06e7650be9 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -164,7 +164,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) rxq->mprq_repl = buf; } DRV_LOG(DEBUG, - "port %u Rx queue %u allocated and configured %u segments", + "port %u Multi-Packet Rx queue %u allocated and configured %u segments", rxq->port_id, rxq->idx, wqe_n); return 0; error: @@ -176,7 +176,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) (*rxq->mprq_bufs)[i]); (*rxq->mprq_bufs)[i] = NULL; } - DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u failed, freed everything", rxq->port_id, rxq->idx); rte_errno = err; /* Restore rte_errno. */ return -rte_errno; @@ -194,11 +194,14 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) static int rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { + struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; unsigned int i; int err; + if (mlx5_rxq_mprq_enabled(rxq)) + elts_n *= (1U << rxq_ctrl->rxq.strd_num_n); /* Iterate on segments. */ for (i = 0; (i != elts_n); ++i) { struct rte_mbuf *buf; @@ -284,8 +287,10 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) { - return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? 
- rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl); + int ret = 0; + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + ret = rxq_alloc_elts_mprq(rxq_ctrl); + return (ret || rxq_alloc_elts_sprq(rxq_ctrl)); } /** @@ -304,7 +309,6 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) rxq->port_id, rxq->idx); if (rxq->mprq_bufs == NULL) return; - MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0); for (i = 0; (i != (1u << rxq->elts_n)); ++i) { if ((*rxq->mprq_bufs)[i] != NULL) mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]); @@ -326,15 +330,19 @@ static void rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; - const uint16_t q_n = (1 << rxq->elts_n); - const uint16_t q_mask = q_n - 1; - uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); + unsigned int q_n = (1 << rxq->elts_n); + uint16_t q_mask; + uint16_t used; uint16_t i; DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs", PORT_ID(rxq_ctrl->priv), rxq->idx); if (rxq->elts == NULL) return; + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + q_n *= (1U << rxq_ctrl->rxq.strd_num_n); + q_mask = q_n - 1; + used = q_n - (rxq->rq_ci - rxq->rq_pi); /** * Some mbuf in the Ring belongs to the application. They cannot be * freed. @@ -344,7 +352,7 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; rxq->rq_pi = rxq->rq_ci; } - for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + for (i = 0; (i != q_n); ++i) { if ((*rxq->elts)[i] != NULL) rte_pktmbuf_free_seg((*rxq->elts)[i]); (*rxq->elts)[i] = NULL; @@ -362,8 +370,7 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) { if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) rxq_free_elts_mprq(rxq_ctrl); - else - rxq_free_elts_sprq(rxq_ctrl); + rxq_free_elts_sprq(rxq_ctrl); } /** @@ -1793,20 +1800,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *tmpl; unsigned int mb_len = rte_pktmbuf_data_room_size(mp); - unsigned int mprq_stride_nums; - unsigned int mprq_stride_size; - unsigned int mprq_stride_cap; struct mlx5_dev_config *config = &priv->config; - /* - * Always allocate extra slots, even if eventually - * the vector Rx will not be used. - */ - uint16_t desc_n = - desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; uint64_t offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads; unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO); - const int mprq_en = mlx5_check_mprq_support(dev) > 0; unsigned int max_rx_pkt_len = lro_on_queue ? dev->data->dev_conf.rxmode.max_lro_pkt_size : dev->data->dev_conf.rxmode.max_rx_pkt_len; @@ -1814,6 +1811,23 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, RTE_PKTMBUF_HEADROOM; unsigned int max_lro_size = 0; unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM; + const int mprq_en = mlx5_check_mprq_support(dev) > 0; + unsigned int mprq_stride_nums = config->mprq.stride_num_n ? + config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N; + unsigned int mprq_stride_size = non_scatter_min_mbuf_size <= + (1U << config->mprq.max_stride_size_n) ? + log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N; + unsigned int mprq_stride_cap = (config->mprq.stride_num_n ? + (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) * + (config->mprq.stride_size_n ? + (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size)); + /* + * Always allocate extra slots, even if eventually + * the vector Rx will not be used. 
+ */ + uint16_t desc_n = desc + + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP * + (desc >> mprq_stride_nums); if (non_scatter_min_mbuf_size > mb_len && !(offloads & DEV_RX_OFFLOAD_SCATTER)) { @@ -1825,8 +1839,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, rte_errno = ENOSPC; return NULL; } - tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) + - desc_n * sizeof(struct rte_mbuf *), 0, socket); + tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, + sizeof(*tmpl) + + desc_n * sizeof(struct rte_mbuf *) + + (desc >> mprq_stride_nums) * + sizeof(struct mlx5_mprq_buf *), + 0, socket); if (!tmpl) { rte_errno = ENOMEM; return NULL; @@ -1840,15 +1858,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, tmpl->socket = socket; if (dev->data->dev_conf.intr_conf.rxq) tmpl->irq = 1; - mprq_stride_nums = config->mprq.stride_num_n ? - config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N; - mprq_stride_size = non_scatter_min_mbuf_size <= - (1U << config->mprq.max_stride_size_n) ? - log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N; - mprq_stride_cap = (config->mprq.stride_num_n ? - (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) * - (config->mprq.stride_size_n ? - (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size)); /* * This Rx queue can be configured as a Multi-Packet RQ if all of the * following conditions are met: @@ -1996,7 +2005,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, tmpl->rxq.rq_repl_thresh = MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n); tmpl->rxq.elts = - (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); + (struct rte_mbuf *(*)[desc_n])(tmpl + 1); + if (mlx5_rxq_mprq_enabled(&tmpl->rxq)) { + tmpl->rxq.rq_repl_thresh = 1; + tmpl->rxq.mprq_bufs = + (struct mlx5_mprq_buf *(*)[desc])(tmpl + desc_n + 1); + } #ifndef RTE_ARCH_64 tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq; #endif diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 65239f9ffe..768a242518 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -614,6 +614,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec"); #else return -EINVAL; +#endif + } else if (pkt_burst == mlx5_rx_burst_mprq_vec) { +#if defined RTE_ARCH_X86_64 + snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector SSE"); +#elif defined RTE_ARCH_ARM64 + snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector Neon"); +#elif defined RTE_ARCH_PPC_64 + snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector AltiVec"); +#else + return -EINVAL; #endif } else { return -EINVAL; @@ -1075,7 +1085,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) { const uint16_t cqe_n = 1 << rxq->cqe_n; const uint16_t cqe_mask = cqe_n - 1; - const unsigned int wqe_n = 1 << rxq->elts_n; + unsigned int wqe_n = 1 << rxq->elts_n; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); union { @@ -1139,11 +1149,17 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) &sm)) return -1; if (vec) { - const uint16_t q_mask = wqe_n - 1; + uint16_t q_mask; uint16_t elt_idx; struct rte_mbuf **elt; int i; - unsigned int n = wqe_n - (rxq->rq_ci - + unsigned int n; + + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + wqe_n *= (1U << + rxq_ctrl->rxq.strd_num_n); + q_mask = wqe_n - 1; + n = wqe_n - (rxq->rq_ci - rxq->rq_pi); for (i = 0; i < (int)n; ++i) { @@ -1982,6 +1998,14 @@ 
mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, return 0; } +__rte_weak uint16_t +mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + __rte_weak int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) { diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 5116a15c33..3c44794d68 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -141,11 +141,8 @@ struct mlx5_rxq_data { uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ volatile void *wqes; volatile struct mlx5_cqe(*cqes)[]; - RTE_STD_C11 - union { - struct rte_mbuf *(*elts)[]; - struct mlx5_mprq_buf *(*mprq_bufs)[]; - }; + struct rte_mbuf *(*elts)[]; + struct mlx5_mprq_buf *(*mprq_bufs)[]; struct rte_mempool *mp; struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */ @@ -518,6 +515,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data); int mlx5_check_vec_rx_support(struct rte_eth_dev *dev); uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n); +uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); /* mlx5_mr.c */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index 7fae2010f9..53dd229271 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -119,6 +119,40 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) return tn; } +/** + * DPDK callback for MPRQ vectorized RX. + * + * @param dpdk_rxq + * Generic pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * + * @return + * Number of packets successfully received (<= pkts_n). + */ +uint16_t +mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + uint16_t nb_rx = 0; + uint16_t tn = 0; + uint64_t err = 0; + bool no_cq = false; + + do { + nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn, + &err, &no_cq); + if (unlikely(err | rxq->err_state)) + nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx); + tn += nb_rx; + if (unlikely(no_cq)) + break; + } while (tn != pkts_n); + return tn; +} + /** * Check a RX queue can support vectorized RX. * @@ -134,8 +168,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq) struct mlx5_rxq_ctrl *ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); - if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv))) - return -ENOTSUP; if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0) return -ENOTSUP; if (rxq->lro) @@ -160,8 +192,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev) if (!priv->config.rx_vec_en) return -ENOTSUP; - if (mlx5_mprq_enabled(dev)) - return -ENOTSUP; /* All the configured queues should support. 
*/ for (i = 0; i < priv->rxqs_n; ++i) { struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h index 6ddcbfb0ad..305c5a596a 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -122,4 +122,25 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); } +static inline void +mlx5_rx_replenish_bulk_mprq_mbuf(struct mlx5_rxq_data *rxq, + uint16_t n, uint32_t rq_idx) +{ + const unsigned int strd_n = 1 << rxq->strd_num_n; + uint16_t elts_idx = rq_idx * strd_n + + rq_idx * MLX5_VPMD_DESCS_PER_LOOP; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + unsigned int i; + + n = RTE_MIN(n, strd_n - rxq->consumed_strd); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + rxq->rq_repl_thresh = 0; + /* Prevent overflowing into the next MPRQ mbufs. */ + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[elts_idx + strd_n + i] = &rxq->fake_mbuf; +} + #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h index f5414eebad..8fc3e1fd66 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h @@ -59,6 +59,97 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) pkts[pos] = elts[pos]; } +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + * @param buf + * MPRQ buffer to get packets from. + * @param buf rq_ci + * WQE index. + * @param strd_idx + * Stride number. + * @param comp + * Whether CQE is compressed or not. + */ +static inline void +rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t n, struct mlx5_mprq_buf *buf, + uint16_t rq_ci, uint16_t strd_idx, bool comp) +{ + const unsigned int strd_sz = 1 << rxq->strd_sz_n; + const unsigned int strd_n = 1 << rxq->strd_num_n; + const unsigned int strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + uint32_t offset; + void *addr; + int i = 0; + + if (comp) { + const uint16_t q_mask = (1 << rxq->cqe_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + vector unsigned char mbp; + + mbp = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos + + rxq->consumed_strd]); + *(vector unsigned char *)&pkts[pos] = mbp; + } + if (n & 1) + pkts[pos] = elts[pos]; + } + + for (i = 0; i < n; ++i) { + offset = (strd_idx + i) * strd_sz + strd_shift; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); + if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len || + rxq->mprq_repl == NULL) { + rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *), + addr, pkts[i]->pkt_len); + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = strd_sz; + void *buf_addr; + /* Increment the refcnt of the whole chunk. */ + rte_atomic16_add_return(&buf->refcnt, 1); + MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); + buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. 
+ */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(buf_addr, buf); + shinfo = &buf->shinfos[strd_idx]; + rte_mbuf_ext_refcnt_set(shinfo, 1); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. + */ + rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova, + buf_len, shinfo); + /* Set mbuf head-room. */ + SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM); + DATA_LEN(pkts[i]) = pkts[i]->pkt_len; + } + } +} + + /** * Decompress a compressed completion and fill in mbufs in RX SW ring with data * extracted from the title completion descriptor. @@ -1136,4 +1227,637 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, return rcvd_pkt; } +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + MLX5_ASSERT(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep, strd_n); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * @param[out] no_cq + * Pointer to a boolean. Set true if no new CQE seen. + * + * @return + * Number of packets received including errors (<= pkts_n). 
+ */ +static inline uint16_t +rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, bool *no_cq) +{ + const unsigned int strd_n = 1 << rxq->strd_num_n; + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + const uint16_t e_n = 1 << rxq->elts_n; + const uint16_t e_mask = e_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_ci = rxq->cq_ci; + unsigned int cq_idx = cq_ci & q_mask; + unsigned int rq_ci = rxq->rq_ci; + unsigned int rq_idx = rq_ci & e_mask; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx]; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const vector unsigned char zero = (vector unsigned char){0}; + const vector unsigned char ones = vec_splat_u8(-1); + const vector unsigned char owner_check = + (vector unsigned char)(vector unsigned long){ + 0x0100000001000000LL, 0x0100000001000000LL}; + const vector unsigned char opcode_check = + (vector unsigned char)(vector unsigned long){ + 0xf0000000f0000000LL, 0xf0000000f0000000LL}; + const vector unsigned char format_check = + (vector unsigned char)(vector unsigned long){ + 0x0c0000000c000000LL, 0x0c0000000c000000LL}; + const vector unsigned char resp_err_check = + (vector unsigned char)(vector unsigned long){ + 0xe0000000e0000000LL, 0xe0000000e0000000LL}; +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const vector unsigned char len_shuf_mask = (vector unsigned char){ + 1, 0, 5, 4, + 9, 8, 13, 12, + -1, -1, -1, -1, + -1, -1, -1, -1}; +#endif + /* Mask to shuffle from extracted CQE to mbuf. */ + const vector unsigned char shuf_mask = (vector unsigned char){ + 5, 4, /* bswap16, pkt_len */ + -1, -1, /* zero out 2nd half of pkt_len */ + 5, 4, /* bswap16, data_len */ + 11, 10, /* bswap16, vlan+tci */ + 15, 14, 13, 12, /* bswap32, rss */ + 1, 2, 3, -1}; /* fdir.hi */ + /* Mask to blend from the last Qword to the first DQword. */ + /* Mask to blend from the last Qword to the first DQword. */ + const vector unsigned char blend_mask = (vector unsigned char){ + -1, 0, 0, 0, + 0, 0, 0, 0, + -1, -1, -1, -1, + -1, -1, -1, -1}; + const vector unsigned char crc_adj = + (vector unsigned char)(vector unsigned short){ + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0}; + const vector unsigned char flow_mark_adj = + (vector unsigned char)(vector unsigned int){ + 0, 0, 0, rxq->mark * (-1)}; + const vector unsigned short cqe_sel_mask1 = + (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0}; + const vector unsigned short cqe_sel_mask2 = + (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0}; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + if (rxq->consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_ci & e_mask, strd_n); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. 
+ */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + rxq->consumed_strd = 0; + ++rq_ci; + buf = (*rxq->mprq_bufs)[rq_ci & e_mask]; + rxq->rq_repl_thresh = 1; + } + if (rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask); + + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + elts_idx = (rq_ci & e_mask) * strd_n + + (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP; + elts = &(*rxq->elts)[elts_idx]; + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf, + rq_ci, rxq->consumed_strd, true); + rxq->consumed_strd += rcvd_pkt; + rxq->rq_pi += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd); + if (!pkts_n) { + *no_cq = !rcvd_pkt; + return rcvd_pkt; + } + /* At this point, there shouldn't be any remaining packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remaining CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. + */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP]; + vector unsigned char cqe_tmp1, cqe_tmp2; + vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + vector unsigned char op_own, op_own_tmp1, op_own_tmp2; + vector unsigned char opcode, owner_mask, invalid_mask; + vector unsigned char comp_mask; + vector unsigned char mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + const vector unsigned char lower_half = { + 0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; + const vector unsigned char upper_half = { + 2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + const vector unsigned long shmax = {64, 64}; + vector unsigned char byte_cnt; + vector unsigned short left, right; + vector unsigned long lshift; + vector __attribute__((altivec(bool__))) + unsigned long shmask; +#endif + vector unsigned char mbp1, mbp2; + vector unsigned char p = + (vector unsigned char)(vector unsigned short){ + 0, 1, 2, 3, 0, 0, 0, 0}; + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + + /* A.0 do not cross the end of CQ. 
*/ + mask = (vector unsigned char)(vector unsigned long){ + (pkts_n - pos) * sizeof(uint16_t) * 8, 0}; + + { + vector unsigned long lshift; + vector __attribute__((altivec(bool__))) + unsigned long shmask; + const vector unsigned long shmax = {64, 64}; + + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + } + + p = (vector unsigned char) + vec_andc((vector unsigned long)p, + (vector unsigned long)mask); + + /* A.1 load cqes. */ + p3 = (unsigned int)((vector unsigned short)p)[3]; + cqes[3] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p3].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + p2 = (unsigned int)((vector unsigned short)p)[2]; + cqes[2] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p2].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + /* B.1 load mbuf pointers. */ + mbp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos + rxq->consumed_strd]); + mbp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&elts[pos + + rxq->consumed_strd + 2]); + + /* A.1 load a block having op_own. */ + p1 = (unsigned int)((vector unsigned short)p)[1]; + cqes[1] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p1].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + cqes[0] = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos].sop_drop_qpn, 0LL}; + rte_compiler_barrier(); + + /* B.2 copy mbuf pointers. */ + *(vector unsigned char *)&pkts[pos] = mbp1; + *(vector unsigned char *)&pkts[pos + 2] = mbp2; + rte_cio_rmb(); + + /* C.1 load remaining CQE data and extract necessary fields. */ + cqe_tmp2 = *(vector unsigned char *) + &cq[pos + p3].pkt_info; + cqe_tmp1 = *(vector unsigned char *) + &cq[pos + p2].pkt_info; + cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p3].csum); + cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p2].csum); + cqes[3] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[3], + (vector unsigned short)cqe_tmp2, cqe_sel_mask1); + cqes[2] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[2], + (vector unsigned short)cqe_tmp1, cqe_sel_mask1); + cqe_tmp2 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p3].rsvd3[9], 0LL}; + cqe_tmp1 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p2].rsvd3[9], 0LL}; + cqes[3] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[3], + (vector unsigned short)cqe_tmp2, + (vector unsigned short)cqe_sel_mask2); + cqes[2] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[2], + (vector unsigned short)cqe_tmp1, + (vector unsigned short)cqe_sel_mask2); + + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask); + pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask); + + /* C.3 adjust CRC length. 
*/ + pkt_mb3 = (vector unsigned char) + ((vector unsigned short)pkt_mb3 - + (vector unsigned short)crc_adj); + pkt_mb2 = (vector unsigned char) + ((vector unsigned short)pkt_mb2 - + (vector unsigned short)crc_adj); + + /* C.4 adjust flow mark. */ + pkt_mb3 = (vector unsigned char) + ((vector unsigned int)pkt_mb3 + + (vector unsigned int)flow_mark_adj); + pkt_mb2 = (vector unsigned char) + ((vector unsigned int)pkt_mb2 + + (vector unsigned int)flow_mark_adj); + + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + *(vector unsigned char *) + &pkts[pos + 3]->pkt_len = pkt_mb3; + *(vector unsigned char *) + &pkts[pos + 2]->pkt_len = pkt_mb2; + + /* E.1 extract op_own field. */ + op_own_tmp2 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[2], + (vector unsigned int)cqes[3]); + + /* C.1 load remaining CQE data and extract necessary fields. */ + cqe_tmp2 = *(vector unsigned char *) + &cq[pos + p1].pkt_info; + cqe_tmp1 = *(vector unsigned char *) + &cq[pos].pkt_info; + cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = vec_sel(cqes[0], cqe_tmp2, blend_mask); + cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos + p1].csum); + cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0, + (signed int const *)&cq[pos].csum); + cqes[1] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[1], + (vector unsigned short)cqe_tmp2, cqe_sel_mask1); + cqes[0] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[0], + (vector unsigned short)cqe_tmp1, cqe_sel_mask1); + cqe_tmp2 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos + p1].rsvd3[9], 0LL}; + cqe_tmp1 = (vector unsigned char)(vector unsigned long){ + *(__rte_aligned(8) unsigned long *) + &cq[pos].rsvd3[9], 0LL}; + cqes[1] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[1], + (vector unsigned short)cqe_tmp2, cqe_sel_mask2); + cqes[0] = (vector unsigned char) + vec_sel((vector unsigned short)cqes[0], + (vector unsigned short)cqe_tmp1, cqe_sel_mask2); + + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask); + pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask); + + /* C.3 adjust CRC length. */ + pkt_mb1 = (vector unsigned char) + ((vector unsigned short)pkt_mb1 - + (vector unsigned short)crc_adj); + pkt_mb0 = (vector unsigned char) + ((vector unsigned short)pkt_mb0 - + (vector unsigned short)crc_adj); + + /* C.4 adjust flow mark. */ + pkt_mb1 = (vector unsigned char) + ((vector unsigned int)pkt_mb1 + + (vector unsigned int)flow_mark_adj); + pkt_mb0 = (vector unsigned char) + ((vector unsigned int)pkt_mb0 + + (vector unsigned int)flow_mark_adj); + + /* E.1 extract op_own byte. */ + op_own_tmp1 = (vector unsigned char) + vec_mergeh((vector unsigned int)cqes[0], + (vector unsigned int)cqes[1]); + op_own = (vector unsigned char) + vec_mergel((vector unsigned long)op_own_tmp1, + (vector unsigned long)op_own_tmp2); + + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + *(vector unsigned char *) + &pkts[pos + 1]->pkt_len = pkt_mb1; + *(vector unsigned char *) + &pkts[pos]->pkt_len = pkt_mb0; + + /* E.2 flip owner bit to mark CQEs from last round. 
*/ + owner_mask = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)owner_check); + if (ownership) + owner_mask = (vector unsigned char) + vec_xor((vector unsigned long)owner_mask, + (vector unsigned long)owner_check); + owner_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)owner_mask, + (vector unsigned int)owner_check); + owner_mask = (vector unsigned char) + vec_packs((vector unsigned int)owner_mask, + (vector unsigned int)zero); + + /* E.3 get mask for invalidated CQEs. */ + opcode = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)opcode_check); + invalid_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)opcode_check, + (vector unsigned int)opcode); + invalid_mask = (vector unsigned char) + vec_packs((vector unsigned int)invalid_mask, + (vector unsigned int)zero); + + /* E.4 mask out beyond boundary. */ + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* E.5 merge invalid_mask with invalid owner. */ + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)owner_mask); + + /* F.1 find compressed CQE format. */ + comp_mask = (vector unsigned char) + vec_and((vector unsigned long)op_own, + (vector unsigned long)format_check); + comp_mask = (vector unsigned char) + vec_cmpeq((vector unsigned int)comp_mask, + (vector unsigned int)format_check); + comp_mask = (vector unsigned char) + vec_packs((vector unsigned int)comp_mask, + (vector unsigned int)zero); + + /* F.2 mask out invalid entries. */ + comp_mask = (vector unsigned char) + vec_andc((vector unsigned long)comp_mask, + (vector unsigned long)invalid_mask); + comp_idx = ((vector unsigned long)comp_mask)[0]; + + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP; + + /* E.6 mask out entries after the compressed CQE. */ + mask = (vector unsigned char)(vector unsigned long){ + (comp_idx * sizeof(uint16_t) * 8), 0}; + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* E.7 count non-compressed valid CQEs. */ + n = ((vector unsigned long)invalid_mask)[0]; + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + + /* D.2 get the final invalid mask. */ + mask = (vector unsigned char)(vector unsigned long){ + (n * sizeof(uint16_t) * 8), 0}; + lshift = vec_splat((vector unsigned long)mask, 0); + shmask = vec_cmpgt(shmax, lshift); + mask = (vector unsigned char) + vec_sl((vector unsigned long)ones, lshift); + mask = (vector unsigned char) + vec_sel((vector unsigned long)shmask, + (vector unsigned long)mask, shmask); + invalid_mask = (vector unsigned char) + vec_or((vector unsigned long)invalid_mask, + (vector unsigned long)mask); + + /* D.3 check error in opcode. 
*/ + opcode = (vector unsigned char) + vec_cmpeq((vector unsigned int)resp_err_check, + (vector unsigned int)opcode); + opcode = (vector unsigned char) + vec_packs((vector unsigned int)opcode, + (vector unsigned int)zero); + opcode = (vector unsigned char) + vec_andc((vector unsigned long)opcode, + (vector unsigned long)invalid_mask); + + /* D.4 mark if any error is set */ + *err |= ((vector unsigned long)opcode)[0]; + + /* D.5 fill in mbuf - rearm_data and packet_type. */ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); + if (rxq->hw_timestamp) { + pkts[pos]->timestamp = + rte_be_to_cpu_64(cq[pos].timestamp); + pkts[pos + 1]->timestamp = + rte_be_to_cpu_64(cq[pos + p1].timestamp); + pkts[pos + 2]->timestamp = + rte_be_to_cpu_64(cq[pos + p2].timestamp); + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } + if (rxq->dynf_meta) { + uint64_t flag = rxq->flow_meta_mask; + int32_t offs = rxq->flow_meta_offset; + uint32_t metadata; + + /* This code is subject for futher optimization. */ + metadata = cq[pos].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + metadata; + pkts[pos]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 1].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + metadata; + pkts[pos + 1]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 2].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + metadata; + pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL; + metadata = cq[pos + 3].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + metadata; + pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = vec_perm(op_own, zero, len_shuf_mask); + byte_cnt = (vector unsigned char) + vec_andc((vector unsigned long)byte_cnt, + (vector unsigned long)invalid_mask); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + left = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, lower_half); + right = vec_perm((vector unsigned short)byte_cnt, + (vector unsigned short)zero, upper_half); + byte_cnt = (vector unsigned char)vec_add(left, right); + rcvd_byte += ((vector unsigned long)byte_cnt)[0]; +#endif + + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. + */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) { + *no_cq = true; + return rcvd_pkt; + } + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf, + rq_ci, rxq->consumed_strd, false); + rxq->cq_ci += nocmp_n; + rxq->consumed_strd += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = + rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]); + /* Return more packets if needed. 
*/ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf, + rq_ci, rxq->consumed_strd, true); + rxq->consumed_strd += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + rte_compiler_barrier(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } + *no_cq = !rcvd_pkt; + return rcvd_pkt; +} + #endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index 555c342626..53c8ed8a9b 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -54,6 +54,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) pkts[pos] = elts[pos]; } +/** + * Store free buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + * @param buf + * MPRQ buffer to get packets from. + * @param buf rq_ci + * WQE index. + * @param strd_idx + * Stride number. + * @param comp + * Whether CQE is compressed or not. + */ +static inline void +rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t n, struct mlx5_mprq_buf *buf, + uint16_t rq_ci, uint16_t strd_idx, bool comp) +{ + const unsigned int strd_sz = 1 << rxq->strd_sz_n; + const unsigned int strd_n = 1 << rxq->strd_num_n; + const unsigned int strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + uint32_t offset; + void *addr; + int i = 0; + + if (comp) { + const uint16_t q_mask = (1 << rxq->cqe_n) - 1; + struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask]; + unsigned int pos; + uint16_t p = n & -2; + + for (pos = 0; pos < p; pos += 2) { + uint64x2_t mbp; + + mbp = vld1q_u64((void *)&elts[pos + + rxq->consumed_strd]); + vst1q_u64((void *)&pkts[pos], mbp); + } + if (n & 1) + pkts[pos] = elts[pos]; + } + + for (i = 0; i < n; ++i) { + offset = (strd_idx + i) * strd_sz + strd_shift; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); + if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len || + rxq->mprq_repl == NULL) { + rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *), + addr, pkts[i]->pkt_len); + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = strd_sz; + void *buf_addr; + /* Increment the refcnt of the whole chunk. */ + rte_atomic16_add_return(&buf->refcnt, 1); + MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); + buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. + */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(buf_addr, buf); + shinfo = &buf->shinfos[strd_idx]; + rte_mbuf_ext_refcnt_set(shinfo, 1); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. + */ + rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova, + buf_len, shinfo); + /* Set mbuf head-room. 
*/ + SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM); + DATA_LEN(pkts[i]) = pkts[i]->pkt_len; + } + } +} + /** * Decompress a compressed completion and fill in mbufs in RX SW ring with data * extracted from the title completion descriptor. @@ -806,4 +895,492 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, return rcvd_pkt; } +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + MLX5_ASSERT(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep, strd_n); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * @param[out] no_cq + * Pointer to a boolean. Set true if no new CQE seen. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, bool *no_cq) +{ + const unsigned int strd_n = 1 << rxq->strd_num_n; + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + const uint16_t e_n = 1 << rxq->elts_n; + const uint16_t e_mask = e_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_ci = rxq->cq_ci; + unsigned int cq_idx = cq_ci & q_mask; + unsigned int rq_ci = rxq->rq_ci; + unsigned int rq_idx = rq_ci & e_mask; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx]; + unsigned int elts_idx; + const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1))); + const uint16x4_t owner_check = vcreate_u16(0x0001000100010001); + const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0); + const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c); + const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; +#endif + /* Mask to generate 16B length vector. */ + const uint8x8_t len_shuf_m = { + 52, 53, /* 4th CQE */ + 36, 37, /* 3rd CQE */ + 20, 21, /* 2nd CQE */ + 4, 5 /* 1st CQE */ + }; + /* Mask to extract 16B data from a 64B CQE. */ + const uint8x16_t cqe_shuf_m = { + 28, 29, /* hdr_type_etc */ + 0, /* pkt_info */ + -1, /* null */ + 47, 46, /* byte_cnt, bswap16 */ + 31, 30, /* vlan_info, bswap16 */ + 15, 14, 13, 12, /* rx_hash_res, bswap32 */ + 57, 58, 59, /* flow_tag */ + 63 /* op_own */ + }; + /* Mask to generate 16B data for mbuf. 
*/ + const uint8x16_t mb_shuf_m = { + 4, 5, -1, -1, /* pkt_len */ + 4, 5, /* data_len */ + 6, 7, /* vlan_tci */ + 8, 9, 10, 11, /* hash.rss */ + 12, 13, 14, -1 /* hash.fdir.hi */ + }; + /* Mask to generate 16B owner vector. */ + const uint8x8_t owner_shuf_m = { + 63, -1, /* 4th CQE */ + 47, -1, /* 3rd CQE */ + 31, -1, /* 2nd CQE */ + 15, -1 /* 1st CQE */ + }; + /* Mask to generate a vector having packet_type/ol_flags. */ + const uint8x16_t ptype_shuf_m = { + 48, 49, 50, -1, /* 4th CQE */ + 32, 33, 34, -1, /* 3rd CQE */ + 16, 17, 18, -1, /* 2nd CQE */ + 0, 1, 2, -1 /* 1st CQE */ + }; + /* Mask to generate a vector having flow tags. */ + const uint8x16_t ftag_shuf_m = { + 60, 61, 62, -1, /* 4th CQE */ + 44, 45, 46, -1, /* 3rd CQE */ + 28, 29, 30, -1, /* 2nd CQE */ + 12, 13, 14, -1 /* 1st CQE */ + }; + const uint16x8_t crc_adj = { + 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0 + }; + const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) }; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + if (rxq->consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_idx, strd_n); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + rxq->consumed_strd = 0; + ++rq_ci; + rq_idx = rq_ci & e_mask; + buf = (*rxq->mprq_bufs)[rq_idx]; + rxq->rq_repl_thresh = 1; + } + if (rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_idx); + + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch_non_temporal(cq); + rte_prefetch_non_temporal(cq + 1); + rte_prefetch_non_temporal(cq + 2); + rte_prefetch_non_temporal(cq + 3); + elts_idx = (rq_ci & e_mask) * strd_n + + (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP; + elts = &(*rxq->elts)[elts_idx]; + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf, + rq_ci, rxq->consumed_strd, true); + rxq->consumed_strd += rcvd_pkt; + pkts += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + } + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd); + if (!pkts_n) { + *no_cq = !rcvd_pkt; + return rcvd_pkt; + } + /* At this point, there shouldn't be any remained packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + /* + * Note that vectors have reverse order - {v3, v2, v1, v0}, because + * there's no instruction to count trailing zeros. __builtin_clzl() is + * used instead. + * + * A. copy 4 mbuf pointers from elts ring to returing pkts. + * B. load 64B CQE and extract necessary fields + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint16_t hdr_type_etc; + * uint8_t pkt_info; + * uint8_t rsvd; + * uint16_t byte_cnt; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * uint8_t flow_tag[3]; + * uint8_t op_own; + * } c; + * C. fill in mbuf. + * D. get valid CQEs. + * E. find compressed CQE. 
+ */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + uint16x4_t op_own; + uint16x4_t opcode, owner_mask, invalid_mask; + uint16x4_t comp_mask; + uint16x4_t mask; + uint16x4_t byte_cnt; + uint32x4_t ptype_info, flow_tag; + register uint64x2_t c0, c1, c2, c3; + uint8_t *p0, *p1, *p2, *p3; + uint8_t *e0 = (void *)&elts[pos + rxq->consumed_strd]->pkt_len; + uint8_t *e1 = (void *)&elts[pos + + rxq->consumed_strd + 1]->pkt_len; + uint8_t *e2 = (void *)&elts[pos + + rxq->consumed_strd + 2]->pkt_len; + uint8_t *e3 = (void *)&elts[pos + + rxq->consumed_strd + 3]->pkt_len; + void *elts_p = (void *)&elts[pos + rxq->consumed_strd]; + void *pkts_p = (void *)&pkts[pos]; + + /* A.0 do not cross the end of CQ. */ + mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> ((pkts_n - pos) * + sizeof(uint16_t) * 8) : 0); + p0 = (void *)&cq[pos].pkt_info; + p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe); + p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe); + p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe); + /* B.0 (CQE 3) load a block having op_own. */ + c3 = vld1q_u64((uint64_t *)(p3 + 48)); + /* B.0 (CQE 2) load a block having op_own. */ + c2 = vld1q_u64((uint64_t *)(p2 + 48)); + /* B.0 (CQE 1) load a block having op_own. */ + c1 = vld1q_u64((uint64_t *)(p1 + 48)); + /* B.0 (CQE 0) load a block having op_own. */ + c0 = vld1q_u64((uint64_t *)(p0 + 48)); + /* Synchronize for loading the rest of blocks. */ + rte_cio_rmb(); + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP; + rte_prefetch_non_temporal(&cq[next]); + rte_prefetch_non_temporal(&cq[next + 1]); + rte_prefetch_non_temporal(&cq[next + 2]); + rte_prefetch_non_temporal(&cq[next + 3]); + } + __asm__ volatile ( + /* B.1 (CQE 3) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p3]] \n\t" + /* B.2 (CQE 3) move the block having op_own. */ + "mov v19.16b, %[c3].16b \n\t" + /* B.3 (CQE 3) extract 16B fields. */ + "tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 2) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t" + /* B.4 (CQE 3) adjust CRC length. */ + "sub v23.8h, v23.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 3) generate final structure for mbuf. */ + "tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 2) move the block having op_own. */ + "mov v19.16b, %[c2].16b \n\t" + /* B.3 (CQE 2) extract 16B fields. */ + "tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 1) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t" + /* B.4 (CQE 2) adjust CRC length. */ + "sub v22.8h, v22.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 2) generate final structure for mbuf. */ + "tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 1) move the block having op_own. */ + "mov v19.16b, %[c1].16b \n\t" + /* B.3 (CQE 1) extract 16B fields. */ + "tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.1 (CQE 0) load the rest of blocks. */ + "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t" + /* B.4 (CQE 1) adjust CRC length. */ + "sub v21.8h, v21.8h, %[crc_adj].8h \n\t" + /* C.1 (CQE 1) generate final structure for mbuf. */ + "tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t" + /* B.2 (CQE 0) move the block having op_own. */ + "mov v19.16b, %[c0].16b \n\t" + /* A.1 load mbuf pointers. */ + "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t" + /* B.3 (CQE 0) extract 16B fields. 
*/ + "tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t" + /* B.4 (CQE 0) adjust CRC length. */ + "sub v20.8h, v20.8h, %[crc_adj].8h \n\t" + /* D.1 extract op_own byte. */ + "tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t" + /* C.2 (CQE 3) adjust flow mark. */ + "add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v15.2d}, [%[e3]] \n\t" + /* C.2 (CQE 2) adjust flow mark. */ + "add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v14.2d}, [%[e2]] \n\t" + /* C.1 (CQE 0) generate final structure for mbuf. */ + "tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t" + /* C.2 (CQE 1) adjust flow mark. */ + "add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v13.2d}, [%[e1]] \n\t" +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Extract byte_cnt. */ + "tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t" +#endif + /* Extract ptype_info. */ + "tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t" + /* Extract flow_tag. */ + "tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t" + /* A.2 copy mbuf pointers. */ + "st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t" + /* C.2 (CQE 0) adjust flow mark. */ + "add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t" + /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */ + "st1 {v12.2d}, [%[e0]] \n\t" + :[op_own]"=&w"(op_own), + [byte_cnt]"=&w"(byte_cnt), + [ptype_info]"=&w"(ptype_info), + [flow_tag]"=&w"(flow_tag) + :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), + [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0), + [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0), + [elts_p]"r"(elts_p), + [pkts_p]"r"(pkts_p), + [cqe_shuf_m]"w"(cqe_shuf_m), + [mb_shuf_m]"w"(mb_shuf_m), + [owner_shuf_m]"w"(owner_shuf_m), + [len_shuf_m]"w"(len_shuf_m), + [ptype_shuf_m]"w"(ptype_shuf_m), + [ftag_shuf_m]"w"(ftag_shuf_m), + [crc_adj]"w"(crc_adj), + [flow_mark_adj]"w"(flow_mark_adj) + :"memory", + "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", + "v24", "v25"); + /* D.2 flip owner bit to mark CQEs from last round. */ + owner_mask = vand_u16(op_own, owner_check); + owner_mask = vceq_u16(owner_mask, ownership); + /* D.3 get mask for invalidated CQEs. */ + opcode = vand_u16(op_own, opcode_check); + invalid_mask = vceq_u16(opcode_check, opcode); + /* E.1 find compressed CQE format. */ + comp_mask = vand_u16(op_own, format_check); + comp_mask = vceq_u16(comp_mask, format_check); + /* D.4 mask out beyond boundary. */ + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.5 merge invalid_mask with invalid owner. */ + invalid_mask = vorr_u16(invalid_mask, owner_mask); + /* E.2 mask out invalid entries. */ + comp_mask = vbic_u16(comp_mask, invalid_mask); + /* E.3 get the first compressed CQE. */ + comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16( + comp_mask), 0)) / + (sizeof(uint16_t) * 8); + /* D.6 mask out entries after the compressed CQE. */ + mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ? + -1UL >> (comp_idx * sizeof(uint16_t) * 8) : + 0); + invalid_mask = vorr_u16(invalid_mask, mask); + /* D.7 count non-compressed valid CQEs. */ + n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16( + invalid_mask), 0)) / (sizeof(uint16_t) * 8); + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ? 
+ -1UL >> (n * sizeof(uint16_t) * 8) : 0);
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.3 check error in opcode. */
+ opcode = vceq_u16(resp_err_check, opcode);
+ opcode = vbic_u16(opcode, invalid_mask);
+ /* D.4 mark if any error is set */
+ *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+ /* C.4 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
+ opcode, &elts[pos]);
+ if (rxq->hw_timestamp) {
+ elts[pos]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p0, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p1, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p2, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p3, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ }
+ if (!!rxq->flow_meta_mask) {
+ /* This code is subject to further optimization. */
+ int32_t offs = rxq->flow_meta_offset;
+
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ container_of(p0, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ container_of(p1, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ container_of(p2, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ container_of(p3, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+ elts[pos]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+ elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+ elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+ elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+ rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+ /*
+ * Break the loop unless more valid CQE is expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+ *no_cq = true;
+ return rcvd_pkt;
+ }
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+ rq_ci, rxq->consumed_strd, false);
+ rxq->cq_ci += nocmp_n;
+ rxq->consumed_strd += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+ &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ if (rq_ci != rxq->rq_ci) {
+ rxq->rq_ci = rq_ci;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ }
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
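For reference, both the NEON loop above and the SSE loop below derive the number of ready descriptors from the op_own bytes in the same way; a plain-C sketch of that screening is given here (illustrative only: the helper name count_ready_cqes() does not exist in the driver, and the bit masks simply mirror the owner_check/opcode_check/format_check constants used in the vectorized code):

#include <stdint.h>

/*
 * Scalar model of the SIMD op_own screening: walk up to n_desc CQEs
 * (n_desc is already clamped to the CQ/burst boundary) and stop at the
 * first one that is still owned by hardware, carries the "invalid"
 * opcode, or is a compressed CQE (mini-CQE array). Returns the number
 * of leading valid, non-compressed CQEs; *comp_idx is the position of
 * the first compressed CQE, or n_desc if none was seen.
 */
static unsigned int
count_ready_cqes(const uint8_t *op_own, unsigned int n_desc,
                 unsigned int ownership, unsigned int *comp_idx)
{
        unsigned int i;

        *comp_idx = n_desc;
        for (i = 0; i < n_desc; ++i) {
                if ((op_own[i] & 0x1) != ownership || /* owner bit from another round */
                    (op_own[i] & 0xf0) == 0xf0)       /* invalid opcode */
                        break;
                if ((op_own[i] & 0x0c) == 0x0c) {     /* compressed CQE */
                        *comp_idx = i;
                        break;
                }
        }
        return i;
}
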
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 34e3397115..4054614674 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -56,6 +56,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
 		pkts[pos] = elts[pos];
 }
 
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param n
+ * Number of packets to be stored.
+ * @param buf
+ * MPRQ buffer to get packets from.
+ * @param rq_ci
+ * WQE index.
+ * @param strd_idx
+ * Stride number.
+ * @param comp
+ * Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t n, struct mlx5_mprq_buf *buf,
+ uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+ const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const unsigned int strd_shift =
+ MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+ uint32_t offset;
+ void *addr;
+ int i = 0;
+
+ if (comp) {
+ const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ __m128i mbp;
+
+ mbp = _mm_loadu_si128((__m128i *)&elts[pos +
+ rxq->consumed_strd]);
+ _mm_storeu_si128((__m128i *)&pkts[pos], mbp);
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos + rxq->consumed_strd];
+ }
+
+ for (i = 0; i < n; ++i) {
+ offset = (strd_idx + i) * strd_sz + strd_shift;
+ addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+ if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+ rxq->mprq_repl == NULL) {
+ rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+ addr, pkts[i]->pkt_len);
+ } else {
+ rte_iova_t buf_iova;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ uint16_t buf_len = strd_sz;
+ void *buf_addr;
+ /* Increment the refcnt of the whole chunk. */
+ rte_atomic16_add_return(&buf->refcnt, 1);
+ MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+ strd_n + 1);
+ buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+ /*
+ * MLX5 device doesn't use iova but it is necessary in a
+ * case where the Rx packet is transmitted via a
+ * different PMD.
+ */
+ buf_iova = rte_mempool_virt2iova(buf) +
+ RTE_PTR_DIFF(buf_addr, buf);
+ shinfo = &buf->shinfos[strd_idx];
+ rte_mbuf_ext_refcnt_set(shinfo, 1);
+ /*
+ * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+ * attaching the stride to mbuf and more offload flags
+ * will be added below by calling rxq_cq_to_mbuf().
+ * Other fields will be overwritten.
+ */
+ rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+ buf_len, shinfo);
+ /* Set mbuf head-room.
*/ + SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM); + DATA_LEN(pkts[i]) = pkts[i]->pkt_len; + } + } +} + /** * Decompress a compressed completion and fill in mbufs in RX SW ring with data * extracted from the title completion descriptor. @@ -753,4 +842,435 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, return rcvd_pkt; } +static inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, + const unsigned int strd_n) +{ + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + void *addr; + + MLX5_ASSERT(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep, strd_n); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * @param[out] no_cq + * Pointer to a boolean. Set true if no new CQE seen. + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, bool *no_cq) +{ + const unsigned int strd_n = 1 << rxq->strd_num_n; + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + const uint16_t e_n = 1 << rxq->elts_n; + const uint16_t e_mask = e_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + unsigned int pos; + uint64_t n; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_ci = rxq->cq_ci; + unsigned int cq_idx = cq_ci & q_mask; + unsigned int rq_ci = rxq->rq_ci; + unsigned int rq_idx = rq_ci & e_mask; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx]; + unsigned int elts_idx; + unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); + const __m128i owner_check = + _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL); + const __m128i opcode_check = + _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL); + const __m128i format_check = + _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL); + const __m128i resp_err_check = + _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL); +#ifdef MLX5_PMD_SOFT_COUNTERS + uint32_t rcvd_byte = 0; + /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ + const __m128i len_shuf_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 12, 13, 8, 9, + 4, 5, 0, 1); +#endif + /* Mask to shuffle from extracted CQE to mbuf. 
*/ + const __m128i shuf_mask = + _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */ + 12, 13, 14, 15, /* rss, bswap32 */ + 10, 11, /* vlan_tci, bswap16 */ + 4, 5, /* data_len, bswap16 */ + -1, -1, /* zero out 2nd half of pkt_len */ + 4, 5 /* pkt_len, bswap16 */); + /* Mask to blend from the last Qword to the first DQword. */ + const __m128i blend_mask = + _mm_set_epi8(-1, -1, -1, -1, + -1, -1, -1, -1, + 0, 0, 0, 0, + 0, 0, 0, -1); + const __m128i zero = _mm_setzero_si128(); + const __m128i ones = _mm_cmpeq_epi32(zero, zero); + const __m128i crc_adj = + _mm_set_epi16(0, 0, 0, 0, 0, + rxq->crc_present * RTE_ETHER_CRC_LEN, + 0, + rxq->crc_present * RTE_ETHER_CRC_LEN); + const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0); + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + + if (rxq->consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. */ + if (rte_atomic16_read(&buf->refcnt) > 1) { + mprq_buf_replace(rxq, rq_ci & e_mask, strd_n); + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, + (void **)&rep)) + rxq->mprq_repl = rep; + } + /* Advance to the next WQE. */ + rxq->consumed_strd = 0; + ++rq_ci; + buf = (*rxq->mprq_bufs)[rq_ci & e_mask]; + rxq->rq_repl_thresh = 1; + } + if (rxq->rq_repl_thresh) + mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask); + + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + elts_idx = (rq_ci & e_mask) * strd_n + + (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP; + elts = &(*rxq->elts)[elts_idx]; + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf, + rq_ci, rxq->consumed_strd, true); + rxq->consumed_strd += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd); + if (!pkts_n) { + *no_cq = !rcvd_pkt; + return rcvd_pkt; + } + /* At this point, there shouldn't be any remained packets. */ + MLX5_ASSERT(rxq->decompressed == 0); + /* + * A. load first Qword (8bytes) in one loop. + * B. copy 4 mbuf pointers from elts ring to returing pkts. + * C. load remained CQE data and extract necessary fields. + * Final 16bytes cqes[] extracted from original 64bytes CQE has the + * following structure: + * struct { + * uint8_t pkt_info; + * uint8_t flow_tag[3]; + * uint16_t byte_cnt; + * uint8_t rsvd4; + * uint8_t op_own; + * uint16_t hdr_type_etc; + * uint16_t vlan_info; + * uint32_t rx_has_res; + * } c; + * D. fill in mbuf. + * E. get valid CQEs. + * F. find compressed CQE. 
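+ * The loop below handles MLX5_VPMD_DESCS_PER_LOOP (4) CQEs per iteration + * and stops at the first invalid or compressed CQE.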
+ */ + for (pos = 0; + pos < pkts_n; + pos += MLX5_VPMD_DESCS_PER_LOOP) { + __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP]; + __m128i cqe_tmp1, cqe_tmp2; + __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; + __m128i op_own, op_own_tmp1, op_own_tmp2; + __m128i opcode, owner_mask, invalid_mask; + __m128i comp_mask; + __m128i mask; +#ifdef MLX5_PMD_SOFT_COUNTERS + __m128i byte_cnt; +#endif + __m128i mbp1, mbp2; + __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); + unsigned int p1, p2, p3; + + /* Prefetch next 4 CQEs. */ + if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]); + rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]); + } + /* A.0 do not cross the end of CQ. */ + mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + p = _mm_andnot_si128(mask, p); + /* A.1 load cqes. */ + p3 = _mm_extract_epi16(p, 3); + cqes[3] = _mm_loadl_epi64((__m128i *) + &cq[pos + p3].sop_drop_qpn); + rte_compiler_barrier(); + p2 = _mm_extract_epi16(p, 2); + cqes[2] = _mm_loadl_epi64((__m128i *) + &cq[pos + p2].sop_drop_qpn); + rte_compiler_barrier(); + /* B.1 load mbuf pointers. */ + mbp1 = _mm_loadu_si128((__m128i *)&elts[pos + + rxq->consumed_strd]); + mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + + rxq->consumed_strd + 2]); + /* A.1 load a block having op_own. */ + p1 = _mm_extract_epi16(p, 1); + cqes[1] = _mm_loadl_epi64((__m128i *) + &cq[pos + p1].sop_drop_qpn); + rte_compiler_barrier(); + cqes[0] = _mm_loadl_epi64((__m128i *) + &cq[pos].sop_drop_qpn); + /* B.2 copy mbuf pointers. */ + _mm_storeu_si128((__m128i *)&pkts[pos], mbp1); + _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2); + rte_cio_rmb(); + /* C.1 load remained CQE data and extract necessary fields. */ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]); + cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask); + cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]); + cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04); + cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask); + pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj); + pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj); + pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3); + _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2); + /* E.1 extract op_own field. */ + op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]); + /* C.1 load remained CQE data and extract necessary fields. 
*/ + cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]); + cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]); + cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask); + cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask); + cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum); + cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30); + cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]); + cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]); + cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04); + cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04); + /* C.2 generate final structure for mbuf with swapping bytes. */ + pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask); + pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask); + /* C.3 adjust CRC length. */ + pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj); + pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj); + /* C.4 adjust flow mark. */ + pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj); + pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj); + /* E.1 extract op_own byte. */ + op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]); + op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2); + /* D.1 fill in mbuf - rx_descriptor_fields1. */ + _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1); + _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0); + /* E.2 flip owner bit to mark CQEs from last round. */ + owner_mask = _mm_and_si128(op_own, owner_check); + if (ownership) + owner_mask = _mm_xor_si128(owner_mask, owner_check); + owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check); + owner_mask = _mm_packs_epi32(owner_mask, zero); + /* E.3 get mask for invalidated CQEs. */ + opcode = _mm_and_si128(op_own, opcode_check); + invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode); + invalid_mask = _mm_packs_epi32(invalid_mask, zero); + /* E.4 mask out beyond boundary. */ + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.5 merge invalid_mask with invalid owner. */ + invalid_mask = _mm_or_si128(invalid_mask, owner_mask); + /* F.1 find compressed CQE format. */ + comp_mask = _mm_and_si128(op_own, format_check); + comp_mask = _mm_cmpeq_epi32(comp_mask, format_check); + comp_mask = _mm_packs_epi32(comp_mask, zero); + /* F.2 mask out invalid entries. */ + comp_mask = _mm_andnot_si128(invalid_mask, comp_mask); + comp_idx = _mm_cvtsi128_si64(comp_mask); + /* F.3 get the first compressed CQE. */ + comp_idx = comp_idx ? + __builtin_ctzll(comp_idx) / + (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + /* E.6 mask out entries after the compressed CQE. */ + mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* E.7 count non-compressed valid CQEs. */ + n = _mm_cvtsi128_si64(invalid_mask); + n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) : + MLX5_VPMD_DESCS_PER_LOOP; + nocmp_n += n; + /* D.2 get the final invalid mask. */ + mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8); + mask = _mm_sll_epi64(ones, mask); + invalid_mask = _mm_or_si128(invalid_mask, mask); + /* D.3 check error in opcode. */ + opcode = _mm_cmpeq_epi32(resp_err_check, opcode); + opcode = _mm_packs_epi32(opcode, zero); + opcode = _mm_andnot_si128(invalid_mask, opcode); + /* D.4 mark if any error is set */ + *err |= _mm_cvtsi128_si64(opcode); + /* D.5 fill in mbuf - rearm_data and packet_type. 
*/ + rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]); + if (rxq->hw_timestamp) { + pkts[pos]->timestamp = + rte_be_to_cpu_64(cq[pos].timestamp); + pkts[pos + 1]->timestamp = + rte_be_to_cpu_64(cq[pos + p1].timestamp); + pkts[pos + 2]->timestamp = + rte_be_to_cpu_64(cq[pos + p2].timestamp); + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } + if (rxq->dynf_meta) { + /* This code is subject for futher optimization. */ + int32_t offs = rxq->flow_meta_offset; + + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + cq[pos].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + cq[pos + p1].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + cq[pos + p2].flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + cq[pos + p3].flow_table_metadata; + if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *)) + pkts[pos]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *)) + pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *)) + pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *)) + pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask; + } +#ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ + byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask); + byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt); + byte_cnt = _mm_hadd_epi16(byte_cnt, zero); + rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero)); +#endif + /* + * Break the loop unless more valid CQE is expected, or if + * there's a compressed CQE. + */ + if (n != MLX5_VPMD_DESCS_PER_LOOP) + break; + } + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) { + *no_cq = true; + return rcvd_pkt; + } + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf, + rq_ci, rxq->consumed_strd, false); + rxq->cq_ci += nocmp_n; + rxq->consumed_strd += nocmp_n; + rcvd_pkt += nocmp_n; +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets += nocmp_n; + rxq->stats.ibytes += rcvd_byte; +#endif + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], + &elts[nocmp_n + rxq->consumed_strd]); + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf, + rq_ci, rxq->consumed_strd, true); + rxq->consumed_strd += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + + rte_compiler_barrier(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_cio_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } + *no_cq = !rcvd_pkt; + return rcvd_pkt; +} + #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */ -- 2.24.1
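A note on the copy-vs-attach decision made by rxq_copy_mprq_mbuf_v() in the hunks above: it follows the same rule as the scalar MPRQ burst, sketched below in plain C (illustrative only; the struct and function names are stand-ins and not part of the driver, while mprq_max_memcpy_len and the spare-buffer pointer mirror the real mlx5_rxq_data fields used in the code above):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative stand-in for the two fields that drive the decision. */
struct mprq_copy_ctx {
        uint32_t mprq_max_memcpy_len; /* mprq_max_memcpy_len devarg */
        void *mprq_repl;              /* spare MPRQ buffer, may be NULL */
};

/*
 * Small packets are memcpy'ed into the mbuf data room; when no spare
 * MPRQ buffer is available the copy is forced as well, so the current
 * buffer is not pinned by external references. Otherwise the stride is
 * attached as an external buffer and the MPRQ chunk is kept alive by
 * its reference count.
 */
static bool
mprq_use_memcpy(const struct mprq_copy_ctx *ctx, uint32_t pkt_len)
{
        return pkt_len <= ctx->mprq_max_memcpy_len || ctx->mprq_repl == NULL;
}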