From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id 13335A04DD; Wed, 21 Oct 2020 22:31:33 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 63F05A8EC; Wed, 21 Oct 2020 22:30:57 +0200 (CEST) Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129]) by dpdk.org (Postfix) with ESMTP id F1E14A575 for ; Wed, 21 Oct 2020 22:30:53 +0200 (CEST) Received: from Internal Mail-Server by MTLPINE1 (envelope-from akozyrev@nvidia.com) with SMTP; 21 Oct 2020 23:30:47 +0300 Received: from nvidia.com (pegasus02.mtr.labs.mlnx [10.210.16.122]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id 09LKUl32019906; Wed, 21 Oct 2020 23:30:47 +0300 From: Alexander Kozyrev To: dev@dpdk.org Cc: rasland@nvidia.com, matan@nvidia.com, viacheslavo@nvidia.com Date: Wed, 21 Oct 2020 20:30:30 +0000 Message-Id: <20201021203030.19042-3-akozyrev@nvidia.com> X-Mailer: git-send-email 2.24.1 In-Reply-To: <20201021203030.19042-1-akozyrev@nvidia.com> References: <20200719041142.14485-1-akozyrev@mellanox.com> <20201021203030.19042-1-akozyrev@nvidia.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [dpdk-dev] [PATCH v2 2/2] net/mlx5: implement vectorized MPRQ burst X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" MPRQ (Multi-Packet Rx Queue) processes one packet at a time using simple scalar instructions. MPRQ works by posting a single large buffer (consisting of multiple fixed-size strides) in order to receive multiple packets at once on this buffer. An Rx packet is then copied to a user-provided mbuf or the PMD attaches the Rx packet to the mbuf via a pointer to an external buffer. 
There is an opportunity to speed up the packet receiving by processing 4 packets simultaneously using SIMD (single instruction, multiple data) extensions. Allocate mbufs in batches for every MPRQ buffer and process the packets in groups of 4 until all the strides are exhausted. Then switch to another MPRQ buffer and repeat the process over again. The vectorized MPRQ burst routine is engaged automatically in case the mprq_en=1 devarg is specified and the vectorization is not disabled explicitly by providing rx_vec_en=0 devarg. There is a limitation: LRO is not supported and scalar MPRQ is selected if it is on. Signed-off-by: Alexander Kozyrev Acked-by: Slava Ovsiienko --- drivers/net/mlx5/mlx5_devx.c | 15 +- drivers/net/mlx5/mlx5_ethdev.c | 20 +- drivers/net/mlx5/mlx5_rxq.c | 96 ++++++---- drivers/net/mlx5/mlx5_rxtx.c | 237 ++++++----------------- drivers/net/mlx5/mlx5_rxtx.h | 200 +++++++++++++++++++- drivers/net/mlx5/mlx5_rxtx_vec.c | 312 ++++++++++++++++++++++++++++++- drivers/net/mlx5/mlx5_rxtx_vec.h | 56 ------ 7 files changed, 644 insertions(+), 292 deletions(-) diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c index 11bda32557..0c99fe7519 100644 --- a/drivers/net/mlx5/mlx5_devx.c +++ b/drivers/net/mlx5/mlx5_devx.c @@ -437,10 +437,17 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx) if (priv->config.cqe_comp && !rxq_data->hw_timestamp && !rxq_data->lro) { cq_attr.cqe_comp_en = 1u; - cq_attr.mini_cqe_res_format = - mlx5_rxq_mprq_enabled(rxq_data) ? - MLX5_CQE_RESP_FORMAT_CSUM_STRIDX : - MLX5_CQE_RESP_FORMAT_HASH; + /* + * Select CSUM miniCQE format only for non-vectorized MPRQ + * Rx burst, use HASH miniCQE format for everything else. 
+ */ + if (mlx5_rxq_check_vec_support(rxq_data) < 0 && + mlx5_rxq_mprq_enabled(rxq_data)) + cq_attr.mini_cqe_res_format = + MLX5_CQE_RESP_FORMAT_CSUM_STRIDX; + else + cq_attr.mini_cqe_res_format = + MLX5_CQE_RESP_FORMAT_HASH; /* * For vectorized Rx, it must not be doubled in order to * make cq_ci and rq_ci aligned. diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 7631f644b2..c70cd301b5 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev) if (dev->rx_pkt_burst == mlx5_rx_burst || dev->rx_pkt_burst == mlx5_rx_burst_mprq || - dev->rx_pkt_burst == mlx5_rx_burst_vec) + dev->rx_pkt_burst == mlx5_rx_burst_vec || + dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec) return ptypes; return NULL; } @@ -480,11 +481,22 @@ mlx5_select_rx_function(struct rte_eth_dev *dev) MLX5_ASSERT(dev != NULL); if (mlx5_check_vec_rx_support(dev) > 0) { - rx_pkt_burst = mlx5_rx_burst_vec; - DRV_LOG(DEBUG, "port %u selected Rx vectorized function", - dev->data->port_id); + if (mlx5_mprq_enabled(dev)) { + rx_pkt_burst = mlx5_rx_burst_mprq_vec; + DRV_LOG(DEBUG, "port %u selected vectorized" + " MPRQ Rx function", dev->data->port_id); + } else { + rx_pkt_burst = mlx5_rx_burst_vec; + DRV_LOG(DEBUG, "port %u selected vectorized" + " SPRQ Rx function", dev->data->port_id); + } } else if (mlx5_mprq_enabled(dev)) { rx_pkt_burst = mlx5_rx_burst_mprq; + DRV_LOG(DEBUG, "port %u selected MPRQ Rx function", + dev->data->port_id); + } else { + DRV_LOG(DEBUG, "port %u selected SPRQ Rx function", + dev->data->port_id); } return rx_pkt_burst; } diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index e1783ba397..ca1625eac6 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -173,7 +173,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) rxq->mprq_repl = buf; } DRV_LOG(DEBUG, - "port %u Rx queue %u allocated and configured %u 
segments", + "port %u MPRQ queue %u allocated and configured %u segments", rxq->port_id, rxq->idx, wqe_n); return 0; error: @@ -185,7 +185,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) (*rxq->mprq_bufs)[i]); (*rxq->mprq_bufs)[i] = NULL; } - DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + DRV_LOG(DEBUG, "port %u MPRQ queue %u failed, freed everything", rxq->port_id, rxq->idx); rte_errno = err; /* Restore rte_errno. */ return -rte_errno; @@ -204,7 +204,9 @@ static int rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n; - unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n; + unsigned int elts_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + (1 << rxq_ctrl->rxq.elts_n) * (1 << rxq_ctrl->rxq.strd_num_n) : + (1 << rxq_ctrl->rxq.elts_n); unsigned int i; int err; @@ -262,7 +264,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf; } DRV_LOG(DEBUG, - "port %u Rx queue %u allocated and configured %u segments" + "port %u SPRQ queue %u allocated and configured %u segments" " (max %u packets)", PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n)); @@ -275,7 +277,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]); (*rxq_ctrl->rxq.elts)[i] = NULL; } - DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything", + DRV_LOG(DEBUG, "port %u SPRQ queue %u failed, freed everything", PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx); rte_errno = err; /* Restore rte_errno. */ return -rte_errno; @@ -293,8 +295,15 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl) { - return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? - rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl); + int ret = 0; + + /** + * For MPRQ we need to allocate both MPRQ buffers + * for WQEs and simple mbufs for vector processing. 
+ */ + if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) + ret = rxq_alloc_elts_mprq(rxq_ctrl); + return (ret || rxq_alloc_elts_sprq(rxq_ctrl)); } /** @@ -309,11 +318,10 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl) struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; uint16_t i; - DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs", - rxq->port_id, rxq->idx); + DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing %d WRs", + rxq->port_id, rxq->idx, (1u << rxq->elts_n)); if (rxq->mprq_bufs == NULL) return; - MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0); for (i = 0; (i != (1u << rxq->elts_n)); ++i) { if ((*rxq->mprq_bufs)[i] != NULL) mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]); @@ -335,25 +343,27 @@ static void rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) { struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq; - const uint16_t q_n = (1 << rxq->elts_n); + const uint16_t q_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ? + (1 << rxq->elts_n) * (1 << rxq->strd_num_n) : + (1 << rxq->elts_n); const uint16_t q_mask = q_n - 1; uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi); uint16_t i; - DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs", - PORT_ID(rxq_ctrl->priv), rxq->idx); + DRV_LOG(DEBUG, "port %u Rx queue %u freeing %d WRs", + PORT_ID(rxq_ctrl->priv), rxq->idx, q_n); if (rxq->elts == NULL) return; /** - * Some mbuf in the Ring belongs to the application. They cannot be - * freed. + * Some mbuf in the Ring belongs to the application. + * They cannot be freed. 
*/ if (mlx5_rxq_check_vec_support(rxq) > 0) { for (i = 0; i < used; ++i) (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL; rxq->rq_pi = rxq->rq_ci; } - for (i = 0; (i != (1u << rxq->elts_n)); ++i) { + for (i = 0; i != q_n; ++i) { if ((*rxq->elts)[i] != NULL) rte_pktmbuf_free_seg((*rxq->elts)[i]); (*rxq->elts)[i] = NULL; @@ -369,10 +379,13 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl) static void rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl) { + /* + * For MPRQ we need to allocate both MPRQ buffers + * for WQEs and simple mbufs for vector processing. + */ if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq)) rxq_free_elts_mprq(rxq_ctrl); - else - rxq_free_elts_sprq(rxq_ctrl); + rxq_free_elts_sprq(rxq_ctrl); } /** @@ -1334,20 +1347,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_rxq_ctrl *tmpl; unsigned int mb_len = rte_pktmbuf_data_room_size(mp); - unsigned int mprq_stride_nums; - unsigned int mprq_stride_size; - unsigned int mprq_stride_cap; struct mlx5_dev_config *config = &priv->config; - /* - * Always allocate extra slots, even if eventually - * the vector Rx will not be used. - */ - uint16_t desc_n = - desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; uint64_t offloads = conf->offloads | dev->data->dev_conf.rxmode.offloads; unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO); - const int mprq_en = mlx5_check_mprq_support(dev) > 0; unsigned int max_rx_pkt_len = lro_on_queue ? dev->data->dev_conf.rxmode.max_lro_pkt_size : dev->data->dev_conf.rxmode.max_rx_pkt_len; @@ -1355,6 +1358,21 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, RTE_PKTMBUF_HEADROOM; unsigned int max_lro_size = 0; unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM; + const int mprq_en = mlx5_check_mprq_support(dev) > 0; + unsigned int mprq_stride_nums = config->mprq.stride_num_n ? 
+ config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N; + unsigned int mprq_stride_size = non_scatter_min_mbuf_size <= + (1U << config->mprq.max_stride_size_n) ? + log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N; + unsigned int mprq_stride_cap = (config->mprq.stride_num_n ? + (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) * + (config->mprq.stride_size_n ? + (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size)); + /* + * Always allocate extra slots, even if eventually + * the vector Rx will not be used. + */ + uint16_t desc_n = desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP; if (non_scatter_min_mbuf_size > mb_len && !(offloads & DEV_RX_OFFLOAD_SCATTER)) { @@ -1366,8 +1384,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, rte_errno = ENOSPC; return NULL; } - tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) + - desc_n * sizeof(struct rte_mbuf *), 0, socket); + tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, + sizeof(*tmpl) + desc_n * sizeof(struct rte_mbuf *) + + (desc >> mprq_stride_nums) * sizeof(struct mlx5_mprq_buf *), + 0, socket); + if (!tmpl) { rte_errno = ENOMEM; return NULL; @@ -1381,15 +1402,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, tmpl->socket = socket; if (dev->data->dev_conf.intr_conf.rxq) tmpl->irq = 1; - mprq_stride_nums = config->mprq.stride_num_n ? - config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N; - mprq_stride_size = non_scatter_min_mbuf_size <= - (1U << config->mprq.max_stride_size_n) ? - log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N; - mprq_stride_cap = (config->mprq.stride_num_n ? - (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) * - (config->mprq.stride_size_n ? 
- (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size)); /* * This Rx queue can be configured as a Multi-Packet RQ if all of the * following conditions are met: @@ -1535,9 +1547,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, tmpl->rxq.mp = mp; tmpl->rxq.elts_n = log2above(desc); tmpl->rxq.rq_repl_thresh = - MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n); + MLX5_VPMD_RXQ_RPLNSH_THRESH(desc_n); tmpl->rxq.elts = - (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); + (struct rte_mbuf *(*)[desc_n])(tmpl + 1); + tmpl->rxq.mprq_bufs = + (struct mlx5_mprq_buf *(*)[desc])(*tmpl->rxq.elts + desc_n); #ifndef RTE_ARCH_64 tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq; #endif diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index b530ff421f..dbb427b5a8 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -19,12 +19,12 @@ #include #include +#include "mlx5_autoconf.h" #include "mlx5_defs.h" #include "mlx5.h" #include "mlx5_mr.h" #include "mlx5_utils.h" #include "mlx5_rxtx.h" -#include "mlx5_autoconf.h" /* TX burst subroutines return codes. 
*/ enum mlx5_txcmp_code { @@ -93,10 +93,6 @@ static __rte_always_inline void rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); -static __rte_always_inline void -mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, - const unsigned int strd_n); - static int mlx5_queue_state_modify(struct rte_eth_dev *dev, struct mlx5_mp_arg_queue_state_modify *sm); @@ -584,7 +580,14 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, struct rte_eth_burst_mode *mode) { eth_rx_burst_t pkt_burst = dev->rx_pkt_burst; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq; + rxq = (*priv->rxqs)[rx_queue_id]; + if (!rxq) { + rte_errno = EINVAL; + return -rte_errno; + } if (pkt_burst == mlx5_rx_burst) { snprintf(mode->info, sizeof(mode->info), "%s", "Scalar"); } else if (pkt_burst == mlx5_rx_burst_mprq) { @@ -598,6 +601,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec"); #else return -EINVAL; +#endif + } else if (pkt_burst == mlx5_rx_burst_mprq_vec) { +#if defined RTE_ARCH_X86_64 + snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE"); +#elif defined RTE_ARCH_ARM64 + snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon"); +#elif defined RTE_ARCH_PPC_64 + snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec"); +#else + return -EINVAL; #endif } else { return -EINVAL; @@ -866,6 +879,8 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) rxq->zip = (struct rxq_zip){ .ai = 0, }; + rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ? + (wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0; /* Update doorbell counter. 
*/ rxq->rq_ci = wqe_n >> rxq->sges_n; rte_io_wmb(); @@ -969,7 +984,8 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) { const uint16_t cqe_n = 1 << rxq->cqe_n; const uint16_t cqe_mask = cqe_n - 1; - const unsigned int wqe_n = 1 << rxq->elts_n; + const uint16_t wqe_n = 1 << rxq->elts_n; + const uint16_t strd_n = 1 << rxq->strd_num_n; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); union { @@ -1033,21 +1049,27 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) &sm)) return -1; if (vec) { - const uint16_t q_mask = wqe_n - 1; - uint16_t elt_idx; + const uint32_t elts_n = + mlx5_rxq_mprq_enabled(rxq) ? + wqe_n * strd_n : wqe_n; + const uint32_t e_mask = elts_n - 1; + uint32_t elts_ci = + mlx5_rxq_mprq_enabled(rxq) ? + rxq->elts_ci : rxq->rq_ci; + uint32_t elt_idx; struct rte_mbuf **elt; int i; - unsigned int n = wqe_n - (rxq->rq_ci - + unsigned int n = elts_n - (elts_ci - rxq->rq_pi); for (i = 0; i < (int)n; ++i) { - elt_idx = (rxq->rq_ci + i) & q_mask; + elt_idx = (elts_ci + i) & e_mask; elt = &(*rxq->elts)[elt_idx]; *elt = rte_mbuf_raw_alloc(rxq->mp); if (!*elt) { for (i--; i >= 0; --i) { - elt_idx = (rxq->rq_ci + - i) & q_mask; + elt_idx = (elts_ci + + i) & elts_n; elt = &(*rxq->elts) [elt_idx]; rte_pktmbuf_free_seg @@ -1056,7 +1078,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) return -1; } } - for (i = 0; i < (int)wqe_n; ++i) { + for (i = 0; i < (int)elts_n; ++i) { elt = &(*rxq->elts)[i]; DATA_LEN(*elt) = (uint16_t)((*elt)->buf_len - @@ -1064,7 +1086,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) } /* Padding with a fake mbuf for vec Rx. 
*/ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) - (*rxq->elts)[wqe_n + i] = + (*rxq->elts)[elts_n + i] = &rxq->fake_mbuf; } mlx5_rxq_initialize(rxq); @@ -1545,31 +1567,6 @@ mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) mlx5_mprq_buf_free_cb(NULL, buf); } -static inline void -mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, - const unsigned int strd_n) -{ - struct mlx5_mprq_buf *rep = rxq->mprq_repl; - volatile struct mlx5_wqe_data_seg *wqe = - &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; - void *addr; - - MLX5_ASSERT(rep != NULL); - /* Replace MPRQ buf. */ - (*rxq->mprq_bufs)[rq_idx] = rep; - /* Replace WQE. */ - addr = mlx5_mprq_buf_addr(rep, strd_n); - wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); - /* If there's only one MR, no need to replace LKey in WQE. */ - if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) - wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); - /* Stash a mbuf for next replacement. */ - if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) - rxq->mprq_repl = rep; - else - rxq->mprq_repl = NULL; -} - /** * DPDK callback for RX with Multi-Packet RQ support. 
* @@ -1587,12 +1584,9 @@ uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct mlx5_rxq_data *rxq = dpdk_rxq; - const unsigned int strd_n = 1 << rxq->strd_num_n; - const unsigned int strd_sz = 1 << rxq->strd_sz_n; - const unsigned int strd_shift = - MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; - const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; - const unsigned int wq_mask = (1 << rxq->elts_n) - 1; + const uint32_t strd_n = 1 << rxq->strd_num_n; + const uint32_t cq_mask = (1 << rxq->cqe_n) - 1; + const uint32_t wq_mask = (1 << rxq->elts_n) - 1; volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; unsigned int i = 0; uint32_t rq_ci = rxq->rq_ci; @@ -1601,37 +1595,18 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) while (i < pkts_n) { struct rte_mbuf *pkt; - void *addr; int ret; uint32_t len; uint16_t strd_cnt; uint16_t strd_idx; - uint32_t offset; uint32_t byte_cnt; - int32_t hdrm_overlap; volatile struct mlx5_mini_cqe8 *mcqe = NULL; uint32_t rss_hash_res = 0; + enum mlx5_rqx_code rxq_code; if (consumed_strd == strd_n) { - /* Replace WQE only if the buffer is still in use. */ - if (__atomic_load_n(&buf->refcnt, - __ATOMIC_RELAXED) > 1) { - mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); - /* Release the old buffer. */ - mlx5_mprq_buf_free(buf); - } else if (unlikely(rxq->mprq_repl == NULL)) { - struct mlx5_mprq_buf *rep; - - /* - * Currently, the MPRQ mempool is out of buffer - * and doing memcpy regardless of the size of Rx - * packet. Retry allocation to get back to - * normal. - */ - if (!rte_mempool_get(rxq->mprq_mp, - (void **)&rep)) - rxq->mprq_repl = rep; - } + /* Replace WQE if the buffer is still in use. */ + mprq_buf_replace(rxq, rq_ci & wq_mask); /* Advance to the next WQE. 
*/ consumed_strd = 0; ++rq_ci; @@ -1667,122 +1642,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); if (rxq->crc_present) len -= RTE_ETHER_CRC_LEN; - offset = strd_idx * strd_sz + strd_shift; - addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); - hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz; - /* - * Memcpy packets to the target mbuf if: - * - The size of packet is smaller than mprq_max_memcpy_len. - * - Out of buffer in the Mempool for Multi-Packet RQ. - * - The packet's stride overlaps a headroom and scatter is off. - */ - if (len <= rxq->mprq_max_memcpy_len || - rxq->mprq_repl == NULL || - (hdrm_overlap > 0 && !rxq->strd_scatter_en)) { - if (likely(rte_pktmbuf_tailroom(pkt) >= len)) { - rte_memcpy(rte_pktmbuf_mtod(pkt, void *), - addr, len); - DATA_LEN(pkt) = len; - } else if (rxq->strd_scatter_en) { - struct rte_mbuf *prev = pkt; - uint32_t seg_len = - RTE_MIN(rte_pktmbuf_tailroom(pkt), len); - uint32_t rem_len = len - seg_len; - - rte_memcpy(rte_pktmbuf_mtod(pkt, void *), - addr, seg_len); - DATA_LEN(pkt) = seg_len; - while (rem_len) { - struct rte_mbuf *next = - rte_pktmbuf_alloc(rxq->mp); - - if (unlikely(next == NULL)) { - rte_pktmbuf_free(pkt); - ++rxq->stats.rx_nombuf; - goto out; - } - NEXT(prev) = next; - SET_DATA_OFF(next, 0); - addr = RTE_PTR_ADD(addr, seg_len); - seg_len = RTE_MIN - (rte_pktmbuf_tailroom(next), - rem_len); - rte_memcpy - (rte_pktmbuf_mtod(next, void *), - addr, seg_len); - DATA_LEN(next) = seg_len; - rem_len -= seg_len; - prev = next; - ++NB_SEGS(pkt); - } - } else { - rte_pktmbuf_free_seg(pkt); + rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf, + strd_idx, strd_cnt); + if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) { + rte_pktmbuf_free_seg(pkt); + if (rxq_code == MLX5_RXQ_CODE_DROPPED) { ++rxq->stats.idropped; continue; } - } else { - rte_iova_t buf_iova; - struct rte_mbuf_ext_shared_info *shinfo; - uint16_t buf_len = strd_cnt 
* strd_sz; - void *buf_addr; - - /* Increment the refcnt of the whole chunk. */ - __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED); - MLX5_ASSERT(__atomic_load_n(&buf->refcnt, - __ATOMIC_RELAXED) <= strd_n + 1); - buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); - /* - * MLX5 device doesn't use iova but it is necessary in a - * case where the Rx packet is transmitted via a - * different PMD. - */ - buf_iova = rte_mempool_virt2iova(buf) + - RTE_PTR_DIFF(buf_addr, buf); - shinfo = &buf->shinfos[strd_idx]; - rte_mbuf_ext_refcnt_set(shinfo, 1); - /* - * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when - * attaching the stride to mbuf and more offload flags - * will be added below by calling rxq_cq_to_mbuf(). - * Other fields will be overwritten. - */ - rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, - buf_len, shinfo); - /* Set mbuf head-room. */ - SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM); - MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); - MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >= - len - (hdrm_overlap > 0 ? hdrm_overlap : 0)); - DATA_LEN(pkt) = len; - /* - * Copy the last fragment of a packet (up to headroom - * size bytes) in case there is a stride overlap with - * a next packet's headroom. Allocate a separate mbuf - * to store this fragment and link it. Scatter is on. 
- */ - if (hdrm_overlap > 0) { - MLX5_ASSERT(rxq->strd_scatter_en); - struct rte_mbuf *seg = - rte_pktmbuf_alloc(rxq->mp); - - if (unlikely(seg == NULL)) { - rte_pktmbuf_free_seg(pkt); - ++rxq->stats.rx_nombuf; - break; - } - SET_DATA_OFF(seg, 0); - rte_memcpy(rte_pktmbuf_mtod(seg, void *), - RTE_PTR_ADD(addr, len - hdrm_overlap), - hdrm_overlap); - DATA_LEN(seg) = hdrm_overlap; - DATA_LEN(pkt) = len - hdrm_overlap; - NEXT(pkt) = seg; - NB_SEGS(pkt) = 2; + if (rxq_code == MLX5_RXQ_CODE_NOMBUF) { + ++rxq->stats.rx_nombuf; + break; } } rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); if (cqe->lro_num_seg > 1) { - mlx5_lro_update_hdr(addr, cqe, len); + mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *), + cqe, len); pkt->ol_flags |= PKT_RX_LRO; pkt->tso_segsz = len / cqe->lro_num_seg; } @@ -1796,7 +1672,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) *(pkts++) = pkt; ++i; } -out: /* Update the consumer indexes. */ rxq->consumed_strd = consumed_strd; rte_io_wmb(); @@ -1878,6 +1753,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, return 0; } +__rte_weak uint16_t +mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + __rte_weak int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) { diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index b243b6f28c..0eafa22d63 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -30,6 +30,7 @@ #include "mlx5_utils.h" #include "mlx5.h" #include "mlx5_autoconf.h" +#include "mlx5_mr.h" /* Support tunnel matching. */ #define MLX5_FLOW_TUNNEL 10 @@ -94,6 +95,12 @@ enum mlx5_rxq_err_state { MLX5_RXQ_ERR_STATE_NEED_READY, }; +enum mlx5_rqx_code { + MLX5_RXQ_CODE_EXIT = 0, + MLX5_RXQ_CODE_NOMBUF, + MLX5_RXQ_CODE_DROPPED, +}; + /* RX queue descriptor. */ struct mlx5_rxq_data { unsigned int csum:1; /* Enable checksum offloading. 
*/ @@ -116,6 +123,7 @@ struct mlx5_rxq_data { volatile uint32_t *rq_db; volatile uint32_t *cq_db; uint16_t port_id; + uint32_t elts_ci; uint32_t rq_ci; uint16_t consumed_strd; /* Number of consumed strides in WQE. */ uint32_t rq_pi; @@ -130,11 +138,8 @@ struct mlx5_rxq_data { uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */ volatile void *wqes; volatile struct mlx5_cqe(*cqes)[]; - RTE_STD_C11 - union { - struct rte_mbuf *(*elts)[]; - struct mlx5_mprq_buf *(*mprq_bufs)[]; - }; + struct rte_mbuf *(*elts)[]; + struct mlx5_mprq_buf *(*mprq_bufs)[]; struct rte_mempool *mp; struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */ struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */ @@ -421,6 +426,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data); int mlx5_check_vec_rx_support(struct rte_eth_dev *dev); uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n); +uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts, + uint16_t pkts_n); /* mlx5_mr.c */ @@ -681,4 +688,187 @@ mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts) return ci; } +/** + * Replace MPRQ buffer. + * + * @param rxq + * Pointer to Rx queue structure. + * @param rq_idx + * RQ index to replace. + */ +static __rte_always_inline void +mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx) +{ + const uint32_t strd_n = 1 << rxq->strd_num_n; + struct mlx5_mprq_buf *rep = rxq->mprq_repl; + volatile struct mlx5_wqe_data_seg *wqe = + &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx]; + void *addr; + + if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) > 1) { + MLX5_ASSERT(rep != NULL); + /* Replace MPRQ buf. */ + (*rxq->mprq_bufs)[rq_idx] = rep; + /* Replace WQE. */ + addr = mlx5_mprq_buf_addr(rep, strd_n); + wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); + /* If there's only one MR, no need to replace LKey in WQE. 
*/ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) + wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); + /* Stash a mbuf for next replacement. */ + if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) + rxq->mprq_repl = rep; + else + rxq->mprq_repl = NULL; + /* Release the old buffer. */ + mlx5_mprq_buf_free(buf); + } else if (unlikely(rxq->mprq_repl == NULL)) { + struct mlx5_mprq_buf *rep; + + /* + * Currently, the MPRQ mempool is out of buffer + * and doing memcpy regardless of the size of Rx + * packet. Retry allocation to get back to + * normal. + */ + if (!rte_mempool_get(rxq->mprq_mp, (void **)&rep)) + rxq->mprq_repl = rep; + } +} + +/** + * Attach or copy MPRQ buffer content to a packet. + * + * @param rxq + * Pointer to Rx queue structure. + * @param pkt + * Pointer to a packet to fill. + * @param len + * Packet length. + * @param buf + * Pointer to a MPRQ buffer to take the data from. + * @param strd_idx + * Stride index to start from. + * @param strd_cnt + * Number of strides to consume. + */ +static __rte_always_inline enum mlx5_rqx_code +mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len, + struct mlx5_mprq_buf *buf, uint16_t strd_idx, uint16_t strd_cnt) +{ + const uint32_t strd_n = 1 << rxq->strd_num_n; + const uint16_t strd_sz = 1 << rxq->strd_sz_n; + const uint16_t strd_shift = + MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; + const int32_t hdrm_overlap = + len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz; + const uint32_t offset = strd_idx * strd_sz + strd_shift; + void *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); + + /* + * Memcpy packets to the target mbuf if: + * - The size of packet is smaller than mprq_max_memcpy_len. + * - Out of buffer in the Mempool for Multi-Packet RQ. + * - The packet's stride overlaps a headroom and scatter is off. 
+ */ + if (len <= rxq->mprq_max_memcpy_len || + rxq->mprq_repl == NULL || + (hdrm_overlap > 0 && !rxq->strd_scatter_en)) { + if (likely(len <= + (uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) { + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), + addr, len); + DATA_LEN(pkt) = len; + } else if (rxq->strd_scatter_en) { + struct rte_mbuf *prev = pkt; + uint32_t seg_len = RTE_MIN(len, (uint32_t) + (pkt->buf_len - RTE_PKTMBUF_HEADROOM)); + uint32_t rem_len = len - seg_len; + + rte_memcpy(rte_pktmbuf_mtod(pkt, void *), + addr, seg_len); + DATA_LEN(pkt) = seg_len; + while (rem_len) { + struct rte_mbuf *next = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(next == NULL)) + return MLX5_RXQ_CODE_NOMBUF; + NEXT(prev) = next; + SET_DATA_OFF(next, 0); + addr = RTE_PTR_ADD(addr, seg_len); + seg_len = RTE_MIN(rem_len, (uint32_t) + (next->buf_len - RTE_PKTMBUF_HEADROOM)); + rte_memcpy + (rte_pktmbuf_mtod(next, void *), + addr, seg_len); + DATA_LEN(next) = seg_len; + rem_len -= seg_len; + prev = next; + ++NB_SEGS(pkt); + } + } else { + return MLX5_RXQ_CODE_DROPPED; + } + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; + uint16_t buf_len = strd_cnt * strd_sz; + void *buf_addr; + + /* Increment the refcnt of the whole chunk. */ + __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED); + MLX5_ASSERT(__atomic_load_n(&buf->refcnt, + __ATOMIC_RELAXED) <= strd_n + 1); + buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a + * different PMD. + */ + buf_iova = rte_mempool_virt2iova(buf) + + RTE_PTR_DIFF(buf_addr, buf); + shinfo = &buf->shinfos[strd_idx]; + rte_mbuf_ext_refcnt_set(shinfo, 1); + /* + * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when + * attaching the stride to mbuf and more offload flags + * will be added below by calling rxq_cq_to_mbuf(). + * Other fields will be overwritten. 
+ */ + rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, + buf_len, shinfo); + /* Set mbuf head-room. */ + SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM); + MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF); + MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >= + len - (hdrm_overlap > 0 ? hdrm_overlap : 0)); + DATA_LEN(pkt) = len; + /* + * Copy the last fragment of a packet (up to headroom + * size bytes) in case there is a stride overlap with + * a next packet's headroom. Allocate a separate mbuf + * to store this fragment and link it. Scatter is on. + */ + if (hdrm_overlap > 0) { + MLX5_ASSERT(rxq->strd_scatter_en); + struct rte_mbuf *seg = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(seg == NULL)) + return MLX5_RXQ_CODE_NOMBUF; + SET_DATA_OFF(seg, 0); + rte_memcpy(rte_pktmbuf_mtod(seg, void *), + RTE_PTR_ADD(addr, len - hdrm_overlap), + hdrm_overlap); + DATA_LEN(seg) = hdrm_overlap; + DATA_LEN(pkt) = len - hdrm_overlap; + NEXT(pkt) = seg; + NB_SEGS(pkt) = 2; + } + } + return MLX5_RXQ_CODE_EXIT; +} + #endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index aa48775738..469ea8401d 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -77,6 +77,177 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, return n; } +/** + * Replenish buffers for RX in bulk. + * + * @param rxq + * Pointer to RX queue structure. 
+ */
+static inline void
+mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
+{
+	const uint16_t q_n = 1 << rxq->elts_n; /* Ring size; elts_n is its log2. */
+	const uint16_t q_mask = q_n - 1;
+	uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi); /* Ring size minus the rq_pi..rq_ci distance. */
+	uint16_t elts_idx = rxq->rq_ci & q_mask; /* Refill starts at the rq_ci slot. */
+	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+	volatile struct mlx5_wqe_data_seg *wq =
+		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
+	unsigned int i;
+
+	if (n >= rxq->rq_repl_thresh) { /* Do nothing until n reaches rq_repl_thresh. */
+		MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
+		MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
+			    MLX5_VPMD_DESCS_PER_LOOP);
+		/* Hold back MLX5_VPMD_DESCS_PER_LOOP entries and do not cross queue end. */
+		n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
+		if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+			rxq->stats.rx_nombuf += n; /* Count the miss; rq_ci and doorbell stay untouched. */
+			return;
+		}
+		for (i = 0; i < n; ++i) {
+			void *buf_addr;
+
+			/*
+			 * In order to support the mbufs with external attached
+			 * data buffer we should use the buf_addr pointer
+			 * instead of rte_mbuf_buf_addr(). It touches the mbuf
+			 * itself and may impact the performance.
+			 */
+			buf_addr = elts[i]->buf_addr;
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+						      RTE_PKTMBUF_HEADROOM); /* Big-endian address just past the headroom. */
+			/* If there's a single MR, no need to replace LKey. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)
+				     > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+		}
+		rxq->rq_ci += n;
+		/* Prevent overflowing into consumed mbufs. */
+		elts_idx = rxq->rq_ci & q_mask;
+		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+			(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf; /* Slots right after rq_ci point at fake_mbuf. */
+		rte_io_wmb(); /* Order the WQE stores above before the doorbell store below. */
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	}
+}
+
+/**
+ * Replenish buffers for MPRQ RX in bulk.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ */ +static inline void +mlx5_rx_mprq_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq) +{ + const uint16_t wqe_n = 1 << rxq->elts_n; + const uint32_t strd_n = 1 << rxq->strd_num_n; + const uint32_t elts_n = wqe_n * strd_n; + const uint32_t wqe_mask = elts_n - 1; + uint32_t n = elts_n - (rxq->elts_ci - rxq->rq_pi); + uint32_t elts_idx = rxq->elts_ci & wqe_mask; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + + /* Not to cross queue end. */ + if (n >= rxq->rq_repl_thresh) { + MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n)); + MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n) > + MLX5_VPMD_DESCS_PER_LOOP); + n = RTE_MIN(n, elts_n - elts_idx); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + rxq->elts_ci += n; + } +} + +/** + * Copy or attach MPRQ buffers to RX SW ring. + * + * @param rxq + * Pointer to RX queue structure. + * @param pkts + * Pointer to array of packets to be stored. + * @param pkts_n + * Number of packets to be stored. + * + * @return + * Number of packets successfully copied/attached (<= pkts_n). + */ +static inline uint16_t +rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, + struct rte_mbuf **pkts, uint16_t pkts_n) +{ + const uint16_t wqe_n = 1 << rxq->elts_n; + const uint16_t wqe_mask = wqe_n - 1; + const uint16_t strd_sz = 1 << rxq->strd_sz_n; + const uint32_t strd_n = 1 << rxq->strd_num_n; + const uint32_t elts_n = wqe_n * strd_n; + const uint32_t elts_mask = elts_n - 1; + uint32_t elts_idx = rxq->rq_pi & elts_mask; + struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; + uint32_t rq_ci = rxq->rq_ci; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask]; + uint16_t copied = 0; + uint16_t i = 0; + + for (i = 0; i < pkts_n; ++i) { + uint16_t strd_cnt; + enum mlx5_rqx_code rxq_code; + + if (rxq->consumed_strd == strd_n) { + /* Replace WQE if the buffer is still in use. */ + mprq_buf_replace(rxq, rq_ci & wqe_mask); + /* Advance to the next WQE. 
*/ + rxq->consumed_strd = 0; + rq_ci++; + buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask]; + } + + if (!elts[i]->pkt_len) { + rxq->consumed_strd = strd_n; + rte_pktmbuf_free_seg(elts[i]); +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets -= 1; +#endif + continue; + } + strd_cnt = (elts[i]->pkt_len / strd_sz) + + ((elts[i]->pkt_len % strd_sz) ? 1 : 0); + rxq_code = mprq_buf_to_pkt(rxq, elts[i], elts[i]->pkt_len, + buf, rxq->consumed_strd, strd_cnt); + rxq->consumed_strd += strd_cnt; + if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) { + rte_pktmbuf_free_seg(elts[i]); +#ifdef MLX5_PMD_SOFT_COUNTERS + rxq->stats.ipackets -= 1; + rxq->stats.ibytes -= elts[i]->pkt_len; +#endif + if (rxq_code == MLX5_RXQ_CODE_NOMBUF) { + ++rxq->stats.rx_nombuf; + break; + } + if (rxq_code == MLX5_RXQ_CODE_DROPPED) { + ++rxq->stats.idropped; + continue; + } + } + pkts[copied++] = elts[i]; + } + rxq->rq_pi += i; + rxq->cq_ci += i; + rte_io_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + if (rq_ci != rxq->rq_ci) { + rxq->rq_ci = rq_ci; + rte_io_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + } + return copied; +} + /** * Receive burst of packets. An errored completion also consumes a mbuf, but the * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed @@ -204,7 +375,142 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) bool no_cq = false; do { - nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err, &no_cq); + nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, + &err, &no_cq); + if (unlikely(err | rxq->err_state)) + nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx); + tn += nb_rx; + if (unlikely(no_cq)) + break; + } while (tn != pkts_n); + return tn; +} + +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. 
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+	const uint16_t q_n = 1 << rxq->cqe_n; /* CQ size; cqe_n is its log2. */
+	const uint16_t q_mask = q_n - 1;
+	const uint16_t wqe_n = 1 << rxq->elts_n;
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t elts_n = wqe_n * strd_n; /* SW ring holds wqe_n * strd_n mbuf slots. */
+	const uint32_t elts_mask = elts_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; /* Sentinel; checked below against rxq_cq_process_v() output. */
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	uint16_t cp_pkt = 0;
+	unsigned int cq_idx = rxq->cq_ci & q_mask;
+	unsigned int elts_idx;
+
+	MLX5_ASSERT(rxq->sges_n == 0);
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch0(cq); /* Prefetch the first four CQEs. */
+	rte_prefetch0(cq + 1);
+	rte_prefetch0(cq + 2);
+	rte_prefetch0(cq + 3);
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	mlx5_rx_mprq_replenish_bulk_mbuf(rxq);
+	/* First hand over entries left in rxq->decompressed by an earlier call. */
+	rcvd_pkt = rxq->decompressed;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt); /* May store fewer than rcvd_pkt entries. */
+		rxq->decompressed -= rcvd_pkt; /* All rcvd_pkt entries count as consumed. */
+		pkts += cp_pkt;
+	}
+	elts_idx = rxq->rq_pi & elts_mask;
+	elts = &(*rxq->elts)[elts_idx];
+	/* Not to overflow pkts array. */
+	pkts_n = RTE_ALIGN_FLOOR(pkts_n - cp_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, elts_n - elts_idx);
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+	/* Not to move past the allocated mbufs. */
+	pkts_n = RTE_MIN(pkts_n, rxq->elts_ci - rxq->rq_pi);
+	if (!pkts_n) {
+		*no_cq = !cp_pkt; /* Report no CQE only if nothing was handed over above. */
+		return cp_pkt;
+	}
+	/* At this point, there shouldn't be any remaining packets. */
+	MLX5_ASSERT(rxq->decompressed == 0);
+	/* Process all the CQEs. */
+	nocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, err, &comp_idx);
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
+		return cp_pkt;
+	}
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n);
+	rcvd_pkt += cp_pkt; /* NOTE(review): rcvd_pkt still holds the consumed (not stored) leftover count - confirm. */
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			cp_pkt = rxq_copy_mprq_mbuf_v(rxq, &pkts[cp_pkt], n); /* Store right after the cp_pkt packets stored above. */
+			rcvd_pkt += cp_pkt;
+			rxq->decompressed -= n; /* The rest stays pending for the next call. */
+		}
+	}
+	*no_cq = !rcvd_pkt;
+	return rcvd_pkt;
+}
+
+/**
+ * DPDK callback for vectorized MPRQ RX.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */ +uint16_t +mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) +{ + struct mlx5_rxq_data *rxq = dpdk_rxq; + uint16_t nb_rx = 0; + uint16_t tn = 0; + uint64_t err = 0; + bool no_cq = false; + + do { + nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn, + &err, &no_cq); if (unlikely(err | rxq->err_state)) nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx); tn += nb_rx; @@ -229,8 +535,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq) struct mlx5_rxq_ctrl *ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); - if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv))) - return -ENOTSUP; if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0) return -ENOTSUP; if (rxq->lro) @@ -257,8 +561,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev) return -ENOTSUP; if (!priv->config.rx_vec_en) return -ENOTSUP; - if (mlx5_mprq_enabled(dev)) - return -ENOTSUP; /* All the configured queues should support. */ for (i = 0; i < priv->rxqs_n; ++i) { struct mlx5_rxq_data *rxq = (*priv->rxqs)[i]; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h index ce27074b08..93b4f517bb 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -12,7 +12,6 @@ #include #include "mlx5_autoconf.h" - #include "mlx5_mr.h" /* HW checksum offload capabilities of vectorized Tx. */ @@ -68,59 +67,4 @@ S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) == S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) == offsetof(struct mlx5_cqe, sop_drop_qpn) + 7); -/** - * Replenish buffers for RX in bulk. - * - * @param rxq - * Pointer to RX queue structure. 
- */ -static inline void -mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq) -{ - const uint16_t q_n = 1 << rxq->elts_n; - const uint16_t q_mask = q_n - 1; - uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi); - uint16_t elts_idx = rxq->rq_ci & q_mask; - struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; - volatile struct mlx5_wqe_data_seg *wq = - &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx]; - unsigned int i; - - if (n >= rxq->rq_repl_thresh) { - MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)); - MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) > - MLX5_VPMD_DESCS_PER_LOOP); - /* Not to cross queue end. */ - n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); - if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { - rxq->stats.rx_nombuf += n; - return; - } - for (i = 0; i < n; ++i) { - void *buf_addr; - - /* - * In order to support the mbufs with external attached - * data buffer we should use the buf_addr pointer - * instead of rte_mbuf_buf_addr(). It touches the mbuf - * itself and may impact the performance. - */ - buf_addr = elts[i]->buf_addr; - wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr + - RTE_PKTMBUF_HEADROOM); - /* If there's a single MR, no need to replace LKey. */ - if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) - > 1)) - wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); - } - rxq->rq_ci += n; - /* Prevent overflowing into consumed mbufs. */ - elts_idx = rxq->rq_ci & q_mask; - for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) - (*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf; - rte_io_wmb(); - *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); - } -} - #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */ -- 2.24.1