DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] net/mlx5: implement vectorized MPRQ burst
@ 2020-07-19  4:11 Alexander Kozyrev
  2020-10-21 20:30 ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Alexander Kozyrev
  0 siblings, 1 reply; 5+ messages in thread
From: Alexander Kozyrev @ 2020-07-19  4:11 UTC (permalink / raw)
  To: dev; +Cc: rasland, viacheslavo

MPRQ (Multi-Packet Rx Queue) processes one packet at the time
using simple scalar instructions. MPRQ works by posting a single
large buffer (consisted of multiple fixed-size strides) in order to
receive multiple packets at once on this buffer. A Rx packet is then
copied to a user-provided mbuf or PMD attaches the Rx packet to
the mbuf by the pointer to an external buffer.

There is an opportunity to speed up the packet receiving by processing
4 packets simultaneously using SIMD (single instruction, multiple data)
extensions. Allocate mbufs in batches for every MPRQ buffer and process
the packets in the groups of 4 until all the strides are exhausted. Then
switch to another MPRQ buffer and repeat the process over again.

The vectorized MPRQ burst routine is engaged automatically in case
the mprq_en=1 devarg is specified and the vectorization is not disabled
explicitly by providing rx_vec_en=0 devarg.  There are two limitations:
- LRO is not supported and scalar MPRQ is selected if it is on.
- CQE compression is disabled in case vectorized MPRQ is engaged.

Signed-off-by: Alexander Kozyrev <akozyrev@mellanox.com>
---
 drivers/net/mlx5/linux/mlx5_os.c         |   4 +
 drivers/net/mlx5/mlx5_ethdev.c           |  12 +-
 drivers/net/mlx5/mlx5_rxq.c              |  80 +--
 drivers/net/mlx5/mlx5_rxtx.c             |  30 +-
 drivers/net/mlx5/mlx5_rxtx.h             |   9 +-
 drivers/net/mlx5/mlx5_rxtx_vec.c         |  38 +-
 drivers/net/mlx5/mlx5_rxtx_vec.h         |  21 +
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 724 +++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 577 ++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 520 ++++++++++++++++
 10 files changed, 1968 insertions(+), 47 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 742e2fba49..927fa07270 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -568,6 +568,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		cqe_comp = 0;
 	else
 		cqe_comp = 1;
+	if (config.mprq.enabled)
+		cqe_comp = 0;
 	config.cqe_comp = cqe_comp;
 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
 	/* Whether device supports 128B Rx CQE padding. */
@@ -973,6 +975,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 				" setting default value (%u)",
 				1 << config.mprq.stride_num_n);
 		}
+		if (config.mprq.stride_size_n)
+			config.rx_vec_en = false;
 		if (config.mprq.stride_size_n &&
 		    (config.mprq.stride_size_n > mprq_max_stride_size_n ||
 		     config.mprq.stride_size_n < mprq_min_stride_size_n)) {
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index cefb45064e..f48e8ea293 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 
 	if (dev->rx_pkt_burst == mlx5_rx_burst ||
 	    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
-	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
+	    dev->rx_pkt_burst == mlx5_rx_burst_vec ||
+	    dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)
 		return ptypes;
 	return NULL;
 }
@@ -479,12 +480,19 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
 	eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
 
 	MLX5_ASSERT(dev != NULL);
-	if (mlx5_check_vec_rx_support(dev) > 0) {
+	if (mlx5_check_vec_rx_support(dev) > 0 &&
+		mlx5_mprq_enabled(dev)) {
+		rx_pkt_burst = mlx5_rx_burst_mprq_vec;
+		DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx vectorized function",
+			dev->data->port_id);
+	} else if (mlx5_check_vec_rx_support(dev) > 0) {
 		rx_pkt_burst = mlx5_rx_burst_vec;
 		DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
 			dev->data->port_id);
 	} else if (mlx5_mprq_enabled(dev)) {
 		rx_pkt_burst = mlx5_rx_burst_mprq;
+		DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx function",
+			dev->data->port_id);
 	}
 	return rx_pkt_burst;
 }
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 67d996cabf..06e7650be9 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -164,7 +164,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 			rxq->mprq_repl = buf;
 	}
 	DRV_LOG(DEBUG,
-		"port %u Rx queue %u allocated and configured %u segments",
+		"port %u Multi-Packet Rx queue %u allocated and configured %u segments",
 		rxq->port_id, rxq->idx, wqe_n);
 	return 0;
 error:
@@ -176,7 +176,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 					(*rxq->mprq_bufs)[i]);
 		(*rxq->mprq_bufs)[i] = NULL;
 	}
-	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u failed, freed everything",
 		rxq->port_id, rxq->idx);
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -194,11 +194,14 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 static int
 rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
+	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
 	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
 	unsigned int i;
 	int err;
 
+	if (mlx5_rxq_mprq_enabled(rxq))
+		elts_n *= (1U << rxq_ctrl->rxq.strd_num_n);
 	/* Iterate on segments. */
 	for (i = 0; (i != elts_n); ++i) {
 		struct rte_mbuf *buf;
@@ -284,8 +287,10 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 int
 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
-	return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
-	       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
+	int ret = 0;
+	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+		ret = rxq_alloc_elts_mprq(rxq_ctrl);
+	return (ret || rxq_alloc_elts_sprq(rxq_ctrl));
 }
 
 /**
@@ -304,7 +309,6 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 		rxq->port_id, rxq->idx);
 	if (rxq->mprq_bufs == NULL)
 		return;
-	MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);
 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
 		if ((*rxq->mprq_bufs)[i] != NULL)
 			mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
@@ -326,15 +330,19 @@ static void
 rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
-	const uint16_t q_n = (1 << rxq->elts_n);
-	const uint16_t q_mask = q_n - 1;
-	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
+	unsigned int q_n = (1 << rxq->elts_n);
+	uint16_t q_mask;
+	uint16_t used;
 	uint16_t i;
 
 	DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
 		PORT_ID(rxq_ctrl->priv), rxq->idx);
 	if (rxq->elts == NULL)
 		return;
+	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+		q_n *= (1U << rxq_ctrl->rxq.strd_num_n);
+	q_mask = q_n - 1;
+	used = q_n - (rxq->rq_ci - rxq->rq_pi);
 	/**
 	 * Some mbuf in the Ring belongs to the application.  They cannot be
 	 * freed.
@@ -344,7 +352,7 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
 		rxq->rq_pi = rxq->rq_ci;
 	}
-	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+	for (i = 0; (i != q_n); ++i) {
 		if ((*rxq->elts)[i] != NULL)
 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
 		(*rxq->elts)[i] = NULL;
@@ -362,8 +370,7 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
 		rxq_free_elts_mprq(rxq_ctrl);
-	else
-		rxq_free_elts_sprq(rxq_ctrl);
+	rxq_free_elts_sprq(rxq_ctrl);
 }
 
 /**
@@ -1793,20 +1800,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_rxq_ctrl *tmpl;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-	unsigned int mprq_stride_nums;
-	unsigned int mprq_stride_size;
-	unsigned int mprq_stride_cap;
 	struct mlx5_dev_config *config = &priv->config;
-	/*
-	 * Always allocate extra slots, even if eventually
-	 * the vector Rx will not be used.
-	 */
-	uint16_t desc_n =
-		desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
 	uint64_t offloads = conf->offloads |
 			   dev->data->dev_conf.rxmode.offloads;
 	unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
-	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
 	unsigned int max_rx_pkt_len = lro_on_queue ?
 			dev->data->dev_conf.rxmode.max_lro_pkt_size :
 			dev->data->dev_conf.rxmode.max_rx_pkt_len;
@@ -1814,6 +1811,23 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 							RTE_PKTMBUF_HEADROOM;
 	unsigned int max_lro_size = 0;
 	unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
+	unsigned int mprq_stride_nums = config->mprq.stride_num_n ?
+		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
+	unsigned int mprq_stride_size = non_scatter_min_mbuf_size <=
+		(1U << config->mprq.max_stride_size_n) ?
+		log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
+	unsigned int mprq_stride_cap = (config->mprq.stride_num_n ?
+		(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
+			(config->mprq.stride_size_n ?
+		(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
+	/*
+	 * Always allocate extra slots, even if eventually
+	 * the vector Rx will not be used.
+	 */
+	uint16_t desc_n = desc +
+		config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP *
+		(desc >> mprq_stride_nums);
 
 	if (non_scatter_min_mbuf_size > mb_len && !(offloads &
 						    DEV_RX_OFFLOAD_SCATTER)) {
@@ -1825,8 +1839,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		rte_errno = ENOSPC;
 		return NULL;
 	}
-	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
-			   desc_n * sizeof(struct rte_mbuf *), 0, socket);
+	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+				 sizeof(*tmpl) +
+				 desc_n * sizeof(struct rte_mbuf *) +
+				 (desc >> mprq_stride_nums) *
+				 sizeof(struct mlx5_mprq_buf *),
+				 0, socket);
 	if (!tmpl) {
 		rte_errno = ENOMEM;
 		return NULL;
@@ -1840,15 +1858,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
-	mprq_stride_nums = config->mprq.stride_num_n ?
-		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
-	mprq_stride_size = non_scatter_min_mbuf_size <=
-		(1U << config->mprq.max_stride_size_n) ?
-		log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
-	mprq_stride_cap = (config->mprq.stride_num_n ?
-		(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
-			(config->mprq.stride_size_n ?
-		(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
 	/*
 	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
 	 * following conditions are met:
@@ -1996,7 +2005,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->rxq.rq_repl_thresh =
 		MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
 	tmpl->rxq.elts =
-		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+		(struct rte_mbuf *(*)[desc_n])(tmpl + 1);
+	if (mlx5_rxq_mprq_enabled(&tmpl->rxq)) {
+		tmpl->rxq.rq_repl_thresh = 1;
+		tmpl->rxq.mprq_bufs =
+			(struct mlx5_mprq_buf *(*)[desc])(tmpl + desc_n + 1);
+	}
 #ifndef RTE_ARCH_64
 	tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;
 #endif
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 65239f9ffe..768a242518 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -614,6 +614,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
 #else
 		return -EINVAL;
+#endif
+	} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
+#if defined RTE_ARCH_X86_64
+		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector SSE");
+#elif defined RTE_ARCH_ARM64
+		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+		snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector AltiVec");
+#else
+		return -EINVAL;
 #endif
 	} else {
 		return -EINVAL;
@@ -1075,7 +1085,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 {
 	const uint16_t cqe_n = 1 << rxq->cqe_n;
 	const uint16_t cqe_mask = cqe_n - 1;
-	const unsigned int wqe_n = 1 << rxq->elts_n;
+	unsigned int wqe_n = 1 << rxq->elts_n;
 	struct mlx5_rxq_ctrl *rxq_ctrl =
 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 	union {
@@ -1139,11 +1149,17 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 						    &sm))
 				return -1;
 			if (vec) {
-				const uint16_t q_mask = wqe_n - 1;
+				uint16_t q_mask;
 				uint16_t elt_idx;
 				struct rte_mbuf **elt;
 				int i;
-				unsigned int n = wqe_n - (rxq->rq_ci -
+				unsigned int n;
+
+				if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+					wqe_n *= (1U <<
+						  rxq_ctrl->rxq.strd_num_n);
+				q_mask = wqe_n - 1;
+				n = wqe_n - (rxq->rq_ci -
 							  rxq->rq_pi);
 
 				for (i = 0; i < (int)n; ++i) {
@@ -1982,6 +1998,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
 	return 0;
 }
 
+__rte_weak uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,
+		  struct rte_mbuf **pkts __rte_unused,
+		  uint16_t pkts_n __rte_unused)
+{
+	return 0;
+}
+
 __rte_weak int
 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 5116a15c33..3c44794d68 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -141,11 +141,8 @@ struct mlx5_rxq_data {
 	uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
 	volatile void *wqes;
 	volatile struct mlx5_cqe(*cqes)[];
-	RTE_STD_C11
-	union  {
-		struct rte_mbuf *(*elts)[];
-		struct mlx5_mprq_buf *(*mprq_bufs)[];
-	};
+	struct rte_mbuf *(*elts)[];
+	struct mlx5_mprq_buf *(*mprq_bufs)[];
 	struct rte_mempool *mp;
 	struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
 	struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
@@ -518,6 +515,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 			   uint16_t pkts_n);
+uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,
+			   uint16_t pkts_n);
 
 /* mlx5_mr.c */
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 7fae2010f9..53dd229271 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -119,6 +119,40 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	return tn;
 }
 
+/**
+ * DPDK callback for MPRQ vectorized RX.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct mlx5_rxq_data *rxq = dpdk_rxq;
+	uint16_t nb_rx = 0;
+	uint16_t tn = 0;
+	uint64_t err = 0;
+	bool no_cq = false;
+
+	do {
+		nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,
+					 &err, &no_cq);
+		if (unlikely(err | rxq->err_state))
+			nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
+		tn += nb_rx;
+		if (unlikely(no_cq))
+			break;
+	} while (tn != pkts_n);
+	return tn;
+}
+
 /**
  * Check a RX queue can support vectorized RX.
  *
@@ -134,8 +168,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)
 	struct mlx5_rxq_ctrl *ctrl =
 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
-	if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))
-		return -ENOTSUP;
 	if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
 		return -ENOTSUP;
 	if (rxq->lro)
@@ -160,8 +192,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)
 
 	if (!priv->config.rx_vec_en)
 		return -ENOTSUP;
-	if (mlx5_mprq_enabled(dev))
-		return -ENOTSUP;
 	/* All the configured queues should support. */
 	for (i = 0; i < priv->rxqs_n; ++i) {
 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index 6ddcbfb0ad..305c5a596a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -122,4 +122,25 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 }
 
+static inline void
+mlx5_rx_replenish_bulk_mprq_mbuf(struct mlx5_rxq_data *rxq,
+				 uint16_t n, uint32_t rq_idx)
+{
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	uint16_t elts_idx = rq_idx * strd_n +
+		rq_idx * MLX5_VPMD_DESCS_PER_LOOP;
+	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+	unsigned int i;
+
+	n = RTE_MIN(n, strd_n - rxq->consumed_strd);
+	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+		rxq->stats.rx_nombuf += n;
+		return;
+	}
+	rxq->rq_repl_thresh = 0;
+	/* Prevent overflowing into the next MPRQ mbufs. */
+	for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+		(*rxq->elts)[elts_idx + strd_n + i] = &rxq->fake_mbuf;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index f5414eebad..8fc3e1fd66 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -59,6 +59,97 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
 		pkts[pos] = elts[pos];
 }
 
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ * @param buf
+ *   MPRQ buffer to get packets from.
+ * @param buf rq_ci
+ *   WQE index.
+ * @param strd_idx
+ *   Stride number.
+ * @param comp
+ *   Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		     uint16_t n, struct mlx5_mprq_buf *buf,
+		     uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+	const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	const unsigned int strd_shift =
+		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+	uint32_t offset;
+	void *addr;
+	int i = 0;
+
+	if (comp) {
+		const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+		struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+		unsigned int pos;
+		uint16_t p = n & -2;
+
+		for (pos = 0; pos < p; pos += 2) {
+			vector unsigned char mbp;
+
+			mbp = (vector unsigned char)vec_vsx_ld(0,
+				(signed int const *)&elts[pos +
+							  rxq->consumed_strd]);
+			*(vector unsigned char *)&pkts[pos] = mbp;
+		}
+		if (n & 1)
+			pkts[pos] = elts[pos];
+	}
+
+	for (i = 0; i < n; ++i) {
+		offset = (strd_idx + i) * strd_sz + strd_shift;
+		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+		if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+		    rxq->mprq_repl == NULL) {
+			rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+				   addr, pkts[i]->pkt_len);
+		} else {
+			rte_iova_t buf_iova;
+			struct rte_mbuf_ext_shared_info *shinfo;
+			uint16_t buf_len = strd_sz;
+			void *buf_addr;
+			/* Increment the refcnt of the whole chunk. */
+			rte_atomic16_add_return(&buf->refcnt, 1);
+			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+					strd_n + 1);
+			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+			/*
+			 * MLX5 device doesn't use iova but it is necessary in a
+			 * case where the Rx packet is transmitted via a
+			 * different PMD.
+			 */
+			buf_iova = rte_mempool_virt2iova(buf) +
+				RTE_PTR_DIFF(buf_addr, buf);
+			shinfo = &buf->shinfos[strd_idx];
+			rte_mbuf_ext_refcnt_set(shinfo, 1);
+			/*
+			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+			 * attaching the stride to mbuf and more offload flags
+			 * will be added below by calling rxq_cq_to_mbuf().
+			 * Other fields will be overwritten.
+			 */
+			rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+						buf_len, shinfo);
+			/* Set mbuf head-room. */
+			SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+			DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+		}
+	}
+}
+
+
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -1136,4 +1227,637 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	return rcvd_pkt;
 }
 
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+		 const unsigned int strd_n)
+{
+	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+	volatile struct mlx5_wqe_data_seg *wqe =
+		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+	void *addr;
+
+	MLX5_ASSERT(rep != NULL);
+	/* Replace MPRQ buf. */
+	(*rxq->mprq_bufs)[rq_idx] = rep;
+	/* Replace WQE. */
+	addr = mlx5_mprq_buf_addr(rep, strd_n);
+	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+	/* If there's only one MR, no need to replace LKey in WQE. */
+	if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+	/* Stash a mbuf for next replacement. */
+	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+		rxq->mprq_repl = rep;
+	else
+		rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *  Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	const uint16_t q_n = 1 << rxq->cqe_n;
+	const uint16_t q_mask = q_n - 1;
+	const uint16_t e_n = 1 << rxq->elts_n;
+	const uint16_t e_mask = e_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	unsigned int pos;
+	uint64_t n;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	unsigned int cq_ci = rxq->cq_ci;
+	unsigned int cq_idx = cq_ci & q_mask;
+	unsigned int rq_ci = rxq->rq_ci;
+	unsigned int rq_idx = rq_ci & e_mask;
+	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+	unsigned int elts_idx;
+	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+	const vector unsigned char zero = (vector unsigned char){0};
+	const vector unsigned char ones = vec_splat_u8(-1);
+	const vector unsigned char owner_check =
+		(vector unsigned char)(vector unsigned long){
+		0x0100000001000000LL, 0x0100000001000000LL};
+	const vector unsigned char opcode_check =
+		(vector unsigned char)(vector unsigned long){
+		0xf0000000f0000000LL, 0xf0000000f0000000LL};
+	const vector unsigned char format_check =
+		(vector unsigned char)(vector unsigned long){
+		0x0c0000000c000000LL, 0x0c0000000c000000LL};
+	const vector unsigned char resp_err_check =
+		(vector unsigned char)(vector unsigned long){
+		0xe0000000e0000000LL, 0xe0000000e0000000LL};
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t rcvd_byte = 0;
+	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+	const vector unsigned char len_shuf_mask = (vector unsigned char){
+		 1,  0,  5,  4,
+		 9,  8, 13, 12,
+		-1, -1, -1, -1,
+		-1, -1, -1, -1};
+#endif
+	/* Mask to shuffle from extracted CQE to mbuf. */
+	const vector unsigned char shuf_mask = (vector unsigned char){
+		 5,  4,           /* bswap16, pkt_len */
+		-1, -1,           /* zero out 2nd half of pkt_len */
+		 5,  4,           /* bswap16, data_len */
+		11, 10,           /* bswap16, vlan+tci */
+		15, 14, 13, 12,   /* bswap32, rss */
+		 1,  2,  3, -1};  /* fdir.hi */
+	/* Mask to blend from the last Qword to the first DQword. */
+	/* Mask to blend from the last Qword to the first DQword. */
+	const vector unsigned char blend_mask = (vector unsigned char){
+		-1,  0,  0,  0,
+		 0,  0,  0,  0,
+		-1, -1, -1, -1,
+		-1, -1, -1, -1};
+	const vector unsigned char crc_adj =
+		(vector unsigned char)(vector unsigned short){
+		rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+		rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0};
+	const vector unsigned char flow_mark_adj =
+		(vector unsigned char)(vector unsigned int){
+		0, 0, 0, rxq->mark * (-1)};
+	const vector unsigned short cqe_sel_mask1 =
+		(vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
+	const vector unsigned short cqe_sel_mask2 =
+		(vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
+
+	MLX5_ASSERT(rxq->sges_n == 0);
+	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+	if (rxq->consumed_strd == strd_n) {
+		/* Replace WQE only if the buffer is still in use. */
+		if (rte_atomic16_read(&buf->refcnt) > 1) {
+			mprq_buf_replace(rxq, rq_ci & e_mask, strd_n);
+			/* Release the old buffer. */
+			mlx5_mprq_buf_free(buf);
+		} else if (unlikely(rxq->mprq_repl == NULL)) {
+			struct mlx5_mprq_buf *rep;
+
+			/*
+			 * Currently, the MPRQ mempool is out of buffer
+			 * and doing memcpy regardless of the size of Rx
+			 * packet. Retry allocation to get back to
+			 * normal.
+			 */
+			if (!rte_mempool_get(rxq->mprq_mp,
+					     (void **)&rep))
+				rxq->mprq_repl = rep;
+		}
+		/* Advance to the next WQE. */
+		rxq->consumed_strd = 0;
+		++rq_ci;
+		buf = (*rxq->mprq_bufs)[rq_ci & e_mask];
+		rxq->rq_repl_thresh = 1;
+	}
+	if (rxq->rq_repl_thresh)
+		mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);
+
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch0(cq);
+	rte_prefetch0(cq + 1);
+	rte_prefetch0(cq + 2);
+	rte_prefetch0(cq + 3);
+	elts_idx = (rq_ci & e_mask) * strd_n +
+		(rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+	elts = &(*rxq->elts)[elts_idx];
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	/* See if there're unreturned mbufs from compressed CQE. */
+	rcvd_pkt = rxq->decompressed;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+				     rq_ci, rxq->consumed_strd, true);
+		rxq->consumed_strd += rcvd_pkt;
+		rxq->rq_pi += rcvd_pkt;
+		rxq->decompressed -= rcvd_pkt;
+		pkts += rcvd_pkt;
+	}
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+	pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
+		return rcvd_pkt;
+	}
+	/* At this point, there shouldn't be any remaining packets. */
+	MLX5_ASSERT(rxq->decompressed == 0);
+
+	/*
+	 * A. load first Qword (8bytes) in one loop.
+	 * B. copy 4 mbuf pointers from elts ring to returing pkts.
+	 * C. load remaining CQE data and extract necessary fields.
+	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+	 *    following structure:
+	 *        struct {
+	 *          uint8_t  pkt_info;
+	 *          uint8_t  flow_tag[3];
+	 *          uint16_t byte_cnt;
+	 *          uint8_t  rsvd4;
+	 *          uint8_t  op_own;
+	 *          uint16_t hdr_type_etc;
+	 *          uint16_t vlan_info;
+	 *          uint32_t rx_has_res;
+	 *        } c;
+	 * D. fill in mbuf.
+	 * E. get valid CQEs.
+	 * F. find compressed CQE.
+	 */
+	for (pos = 0;
+	     pos < pkts_n;
+	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
+		vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP];
+		vector unsigned char cqe_tmp1, cqe_tmp2;
+		vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+		vector unsigned char op_own, op_own_tmp1, op_own_tmp2;
+		vector unsigned char opcode, owner_mask, invalid_mask;
+		vector unsigned char comp_mask;
+		vector unsigned char mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		const vector unsigned char lower_half = {
+			0, 1, 4, 5, 8, 9, 12, 13,
+			16, 17, 20, 21, 24, 25, 28, 29};
+		const vector unsigned char upper_half = {
+			2, 3, 6, 7, 10, 11, 14, 15,
+			18, 19, 22, 23, 26, 27, 30, 31};
+		const vector unsigned long shmax = {64, 64};
+		vector unsigned char byte_cnt;
+		vector unsigned short left, right;
+		vector unsigned long lshift;
+		vector __attribute__((altivec(bool__)))
+			unsigned long shmask;
+#endif
+		vector unsigned char mbp1, mbp2;
+		vector unsigned char p =
+			(vector unsigned char)(vector unsigned short){
+				0, 1, 2, 3, 0, 0, 0, 0};
+		unsigned int p1, p2, p3;
+
+		/* Prefetch next 4 CQEs. */
+		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+		}
+
+		/* A.0 do not cross the end of CQ. */
+		mask = (vector unsigned char)(vector unsigned long){
+			(pkts_n - pos) * sizeof(uint16_t) * 8, 0};
+
+		{
+			vector unsigned long lshift;
+			vector __attribute__((altivec(bool__)))
+				unsigned long shmask;
+			const vector unsigned long shmax = {64, 64};
+
+			lshift = vec_splat((vector unsigned long)mask, 0);
+			shmask = vec_cmpgt(shmax, lshift);
+			mask = (vector unsigned char)
+				vec_sl((vector unsigned long)ones, lshift);
+			mask = (vector unsigned char)
+				vec_sel((vector unsigned long)shmask,
+				(vector unsigned long)mask, shmask);
+		}
+
+		p = (vector unsigned char)
+			vec_andc((vector unsigned long)p,
+			(vector unsigned long)mask);
+
+		/* A.1 load cqes. */
+		p3 = (unsigned int)((vector unsigned short)p)[3];
+		cqes[3] = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos + p3].sop_drop_qpn, 0LL};
+		rte_compiler_barrier();
+
+		p2 = (unsigned int)((vector unsigned short)p)[2];
+		cqes[2] = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos + p2].sop_drop_qpn, 0LL};
+		rte_compiler_barrier();
+
+		/* B.1 load mbuf pointers. */
+		mbp1 = (vector unsigned char)vec_vsx_ld(0,
+			(signed int const *)&elts[pos + rxq->consumed_strd]);
+		mbp2 = (vector unsigned char)vec_vsx_ld(0,
+			(signed int const *)&elts[pos +
+						  rxq->consumed_strd + 2]);
+
+		/* A.1 load a block having op_own. */
+		p1 = (unsigned int)((vector unsigned short)p)[1];
+		cqes[1] = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos + p1].sop_drop_qpn, 0LL};
+		rte_compiler_barrier();
+
+		cqes[0] = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos].sop_drop_qpn, 0LL};
+		rte_compiler_barrier();
+
+		/* B.2 copy mbuf pointers. */
+		*(vector unsigned char *)&pkts[pos] = mbp1;
+		*(vector unsigned char *)&pkts[pos + 2] = mbp2;
+		rte_cio_rmb();
+
+		/* C.1 load remaining CQE data and extract necessary fields. */
+		cqe_tmp2 = *(vector unsigned char *)
+			&cq[pos + p3].pkt_info;
+		cqe_tmp1 = *(vector unsigned char *)
+			&cq[pos + p2].pkt_info;
+		cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask);
+		cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask);
+		cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+			(signed int const *)&cq[pos + p3].csum);
+		cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+			(signed int const *)&cq[pos + p2].csum);
+		cqes[3] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[3],
+			(vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+		cqes[2] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[2],
+			(vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+		cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos + p3].rsvd3[9], 0LL};
+		cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos + p2].rsvd3[9], 0LL};
+		cqes[3] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[3],
+			(vector unsigned short)cqe_tmp2,
+			(vector unsigned short)cqe_sel_mask2);
+		cqes[2] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[2],
+			(vector unsigned short)cqe_tmp1,
+			(vector unsigned short)cqe_sel_mask2);
+
+		/* C.2 generate final structure for mbuf with swapping bytes. */
+		pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask);
+		pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask);
+
+		/* C.3 adjust CRC length. */
+		pkt_mb3 = (vector unsigned char)
+			((vector unsigned short)pkt_mb3 -
+			(vector unsigned short)crc_adj);
+		pkt_mb2 = (vector unsigned char)
+			((vector unsigned short)pkt_mb2 -
+			(vector unsigned short)crc_adj);
+
+		/* C.4 adjust flow mark. */
+		pkt_mb3 = (vector unsigned char)
+			((vector unsigned int)pkt_mb3 +
+			(vector unsigned int)flow_mark_adj);
+		pkt_mb2 = (vector unsigned char)
+			((vector unsigned int)pkt_mb2 +
+			(vector unsigned int)flow_mark_adj);
+
+		/* D.1 fill in mbuf - rx_descriptor_fields1. */
+		*(vector unsigned char *)
+			&pkts[pos + 3]->pkt_len = pkt_mb3;
+		*(vector unsigned char *)
+			&pkts[pos + 2]->pkt_len = pkt_mb2;
+
+		/* E.1 extract op_own field. */
+		op_own_tmp2 = (vector unsigned char)
+			vec_mergeh((vector unsigned int)cqes[2],
+			(vector unsigned int)cqes[3]);
+
+		/* C.1 load remaining CQE data and extract necessary fields. */
+		cqe_tmp2 = *(vector unsigned char *)
+			&cq[pos + p1].pkt_info;
+		cqe_tmp1 = *(vector unsigned char *)
+			&cq[pos].pkt_info;
+		cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask);
+		cqes[0] = vec_sel(cqes[0], cqe_tmp2, blend_mask);
+		cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+			(signed int const *)&cq[pos + p1].csum);
+		cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+			(signed int const *)&cq[pos].csum);
+		cqes[1] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[1],
+			(vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+		cqes[0] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[0],
+			(vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+		cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos + p1].rsvd3[9], 0LL};
+		cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+			*(__rte_aligned(8) unsigned long *)
+			&cq[pos].rsvd3[9], 0LL};
+		cqes[1] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[1],
+			(vector unsigned short)cqe_tmp2, cqe_sel_mask2);
+		cqes[0] = (vector unsigned char)
+			vec_sel((vector unsigned short)cqes[0],
+			(vector unsigned short)cqe_tmp1, cqe_sel_mask2);
+
+		/* C.2 generate final structure for mbuf with swapping bytes. */
+		pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask);
+		pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask);
+
+		/* C.3 adjust CRC length. */
+		pkt_mb1 = (vector unsigned char)
+			((vector unsigned short)pkt_mb1 -
+			(vector unsigned short)crc_adj);
+		pkt_mb0 = (vector unsigned char)
+			((vector unsigned short)pkt_mb0 -
+			(vector unsigned short)crc_adj);
+
+		/* C.4 adjust flow mark. */
+		pkt_mb1 = (vector unsigned char)
+			((vector unsigned int)pkt_mb1 +
+			(vector unsigned int)flow_mark_adj);
+		pkt_mb0 = (vector unsigned char)
+			((vector unsigned int)pkt_mb0 +
+			(vector unsigned int)flow_mark_adj);
+
+		/* E.1 extract op_own byte. */
+		op_own_tmp1 = (vector unsigned char)
+			vec_mergeh((vector unsigned int)cqes[0],
+			(vector unsigned int)cqes[1]);
+		op_own = (vector unsigned char)
+			vec_mergel((vector unsigned long)op_own_tmp1,
+			(vector unsigned long)op_own_tmp2);
+
+		/* D.1 fill in mbuf - rx_descriptor_fields1. */
+		*(vector unsigned char *)
+			&pkts[pos + 1]->pkt_len = pkt_mb1;
+		*(vector unsigned char *)
+			&pkts[pos]->pkt_len = pkt_mb0;
+
+		/* E.2 flip owner bit to mark CQEs from last round. */
+		owner_mask = (vector unsigned char)
+			vec_and((vector unsigned long)op_own,
+			(vector unsigned long)owner_check);
+		if (ownership)
+			owner_mask = (vector unsigned char)
+				vec_xor((vector unsigned long)owner_mask,
+				(vector unsigned long)owner_check);
+		owner_mask = (vector unsigned char)
+			vec_cmpeq((vector unsigned int)owner_mask,
+			(vector unsigned int)owner_check);
+		owner_mask = (vector unsigned char)
+			vec_packs((vector unsigned int)owner_mask,
+			(vector unsigned int)zero);
+
+		/* E.3 get mask for invalidated CQEs. */
+		opcode = (vector unsigned char)
+			vec_and((vector unsigned long)op_own,
+			(vector unsigned long)opcode_check);
+		invalid_mask = (vector unsigned char)
+			vec_cmpeq((vector unsigned int)opcode_check,
+			(vector unsigned int)opcode);
+		invalid_mask = (vector unsigned char)
+			vec_packs((vector unsigned int)invalid_mask,
+			(vector unsigned int)zero);
+
+		/* E.4 mask out beyond boundary. */
+		invalid_mask = (vector unsigned char)
+			vec_or((vector unsigned long)invalid_mask,
+			(vector unsigned long)mask);
+
+		/* E.5 merge invalid_mask with invalid owner. */
+		invalid_mask = (vector unsigned char)
+			vec_or((vector unsigned long)invalid_mask,
+			(vector unsigned long)owner_mask);
+
+		/* F.1 find compressed CQE format. */
+		comp_mask = (vector unsigned char)
+			vec_and((vector unsigned long)op_own,
+			(vector unsigned long)format_check);
+		comp_mask = (vector unsigned char)
+			vec_cmpeq((vector unsigned int)comp_mask,
+			(vector unsigned int)format_check);
+		comp_mask = (vector unsigned char)
+			vec_packs((vector unsigned int)comp_mask,
+			(vector unsigned int)zero);
+
+		/* F.2 mask out invalid entries. */
+		comp_mask = (vector unsigned char)
+			vec_andc((vector unsigned long)comp_mask,
+			(vector unsigned long)invalid_mask);
+		comp_idx = ((vector unsigned long)comp_mask)[0];
+
+		/* F.3 get the first compressed CQE. */
+		comp_idx = comp_idx ? __builtin_ctzll(comp_idx) /
+			(sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP;
+
+		/* E.6 mask out entries after the compressed CQE. */
+		mask = (vector unsigned char)(vector unsigned long){
+			(comp_idx * sizeof(uint16_t) * 8), 0};
+		lshift = vec_splat((vector unsigned long)mask, 0);
+		shmask = vec_cmpgt(shmax, lshift);
+		mask = (vector unsigned char)
+			vec_sl((vector unsigned long)ones, lshift);
+		mask = (vector unsigned char)
+			vec_sel((vector unsigned long)shmask,
+			(vector unsigned long)mask, shmask);
+		invalid_mask = (vector unsigned char)
+			vec_or((vector unsigned long)invalid_mask,
+			(vector unsigned long)mask);
+
+		/* E.7 count non-compressed valid CQEs. */
+		n = ((vector unsigned long)invalid_mask)[0];
+		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+			MLX5_VPMD_DESCS_PER_LOOP;
+		nocmp_n += n;
+
+		/* D.2 get the final invalid mask. */
+		mask = (vector unsigned char)(vector unsigned long){
+			(n * sizeof(uint16_t) * 8), 0};
+		lshift = vec_splat((vector unsigned long)mask, 0);
+		shmask = vec_cmpgt(shmax, lshift);
+		mask = (vector unsigned char)
+			vec_sl((vector unsigned long)ones, lshift);
+		mask = (vector unsigned char)
+			vec_sel((vector unsigned long)shmask,
+			(vector unsigned long)mask, shmask);
+		invalid_mask = (vector unsigned char)
+			vec_or((vector unsigned long)invalid_mask,
+			(vector unsigned long)mask);
+
+		/* D.3 check error in opcode. */
+		opcode = (vector unsigned char)
+			vec_cmpeq((vector unsigned int)resp_err_check,
+			(vector unsigned int)opcode);
+		opcode = (vector unsigned char)
+			vec_packs((vector unsigned int)opcode,
+			(vector unsigned int)zero);
+		opcode = (vector unsigned char)
+			vec_andc((vector unsigned long)opcode,
+			(vector unsigned long)invalid_mask);
+
+		/* D.4 mark if any error is set */
+		*err |= ((vector unsigned long)opcode)[0];
+
+		/* D.5 fill in mbuf - rearm_data and packet_type. */
+		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+		if (rxq->hw_timestamp) {
+			pkts[pos]->timestamp =
+				rte_be_to_cpu_64(cq[pos].timestamp);
+			pkts[pos + 1]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p1].timestamp);
+			pkts[pos + 2]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p2].timestamp);
+			pkts[pos + 3]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p3].timestamp);
+		}
+		if (rxq->dynf_meta) {
+			uint64_t flag = rxq->flow_meta_mask;
+			int32_t offs = rxq->flow_meta_offset;
+			uint32_t metadata;
+
+			/* This code is subject for futher optimization. */
+			metadata = cq[pos].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+								metadata;
+			pkts[pos]->ol_flags |= metadata ? flag : 0ULL;
+			metadata = cq[pos + 1].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+								metadata;
+			pkts[pos + 1]->ol_flags |= metadata ? flag : 0ULL;
+			metadata = cq[pos + 2].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+								metadata;
+			pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL;
+			metadata = cq[pos + 3].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+								metadata;
+			pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL;
+		}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Add up received bytes count. */
+		byte_cnt = vec_perm(op_own, zero, len_shuf_mask);
+		byte_cnt = (vector unsigned char)
+			vec_andc((vector unsigned long)byte_cnt,
+			(vector unsigned long)invalid_mask);
+		left = vec_perm((vector unsigned short)byte_cnt,
+			(vector unsigned short)zero, lower_half);
+		right = vec_perm((vector unsigned short)byte_cnt,
+			(vector unsigned short)zero, upper_half);
+		byte_cnt = (vector unsigned char)vec_add(left, right);
+		left = vec_perm((vector unsigned short)byte_cnt,
+			(vector unsigned short)zero, lower_half);
+		right = vec_perm((vector unsigned short)byte_cnt,
+			(vector unsigned short)zero, upper_half);
+		byte_cnt = (vector unsigned char)vec_add(left, right);
+		rcvd_byte += ((vector unsigned long)byte_cnt)[0];
+#endif
+
+		/*
+		 * Break the loop unless more valid CQE is expected, or if
+		 * there's a compressed CQE.
+		 */
+		if (n != MLX5_VPMD_DESCS_PER_LOOP)
+			break;
+	}
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
+		return rcvd_pkt;
+	}
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+			     rq_ci, rxq->consumed_strd, false);
+	rxq->cq_ci += nocmp_n;
+	rxq->consumed_strd += nocmp_n;
+	rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += nocmp_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed =
+			rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+					     rq_ci, rxq->consumed_strd, true);
+			rxq->consumed_strd += n;
+			rcvd_pkt += n;
+			rxq->decompressed -= n;
+		}
+	}
+	rte_compiler_barrier();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	if (rq_ci != rxq->rq_ci) {
+		rxq->rq_ci = rq_ci;
+		rte_cio_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	}
+	*no_cq = !rcvd_pkt;
+	return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 555c342626..53c8ed8a9b 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -54,6 +54,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
 		pkts[pos] = elts[pos];
 }
 
+/**
+ * Store free buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ * @param buf
+ *   MPRQ buffer to get packets from.
+ * @param buf rq_ci
+ *   WQE index.
+ * @param strd_idx
+ *   Stride number.
+ * @param comp
+ *   Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		     uint16_t n, struct mlx5_mprq_buf *buf,
+		     uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+	const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	const unsigned int strd_shift =
+		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+	uint32_t offset;
+	void *addr;
+	int i = 0;
+
+	if (comp) {
+		const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+		struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+		unsigned int pos;
+		uint16_t p = n & -2;
+
+		for (pos = 0; pos < p; pos += 2) {
+			uint64x2_t mbp;
+
+			mbp = vld1q_u64((void *)&elts[pos +
+						      rxq->consumed_strd]);
+			vst1q_u64((void *)&pkts[pos], mbp);
+		}
+		if (n & 1)
+			pkts[pos] = elts[pos];
+	}
+
+	for (i = 0; i < n; ++i) {
+		offset = (strd_idx + i) * strd_sz + strd_shift;
+		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+		if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+		    rxq->mprq_repl == NULL) {
+			rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+				   addr, pkts[i]->pkt_len);
+		} else {
+			rte_iova_t buf_iova;
+			struct rte_mbuf_ext_shared_info *shinfo;
+			uint16_t buf_len = strd_sz;
+			void *buf_addr;
+			/* Increment the refcnt of the whole chunk. */
+			rte_atomic16_add_return(&buf->refcnt, 1);
+			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+				    strd_n + 1);
+			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+			/*
+			 * MLX5 device doesn't use iova but it is necessary in a
+			 * case where the Rx packet is transmitted via a
+			 * different PMD.
+			 */
+			buf_iova = rte_mempool_virt2iova(buf) +
+				RTE_PTR_DIFF(buf_addr, buf);
+			shinfo = &buf->shinfos[strd_idx];
+			rte_mbuf_ext_refcnt_set(shinfo, 1);
+			/*
+			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+			 * attaching the stride to mbuf and more offload flags
+			 * will be added below by calling rxq_cq_to_mbuf().
+			 * Other fields will be overwritten.
+			 */
+			rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+						  buf_len, shinfo);
+			/* Set mbuf head-room. */
+			SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+			DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+		}
+	}
+}
+
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -806,4 +895,492 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	return rcvd_pkt;
 }
 
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+		 const unsigned int strd_n)
+{
+	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+	volatile struct mlx5_wqe_data_seg *wqe =
+		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+	void *addr;
+
+	MLX5_ASSERT(rep != NULL);
+	/* Replace MPRQ buf. */
+	(*rxq->mprq_bufs)[rq_idx] = rep;
+	/* Replace WQE. */
+	addr = mlx5_mprq_buf_addr(rep, strd_n);
+	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+	/* If there's only one MR, no need to replace LKey in WQE. */
+	if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+	/* Stash a mbuf for next replacement. */
+	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+		rxq->mprq_repl = rep;
+	else
+		rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	const uint16_t q_n = 1 << rxq->cqe_n;
+	const uint16_t q_mask = q_n - 1;
+	const uint16_t e_n = 1 << rxq->elts_n;
+	const uint16_t e_mask = e_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	unsigned int pos;
+	uint64_t n;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	unsigned int cq_ci = rxq->cq_ci;
+	unsigned int cq_idx = cq_ci & q_mask;
+	unsigned int rq_ci = rxq->rq_ci;
+	unsigned int rq_idx = rq_ci & e_mask;
+	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+	unsigned int elts_idx;
+	const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+	const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
+	const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
+	const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
+	const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t rcvd_byte = 0;
+#endif
+	/* Mask to generate 16B length vector. */
+	const uint8x8_t len_shuf_m = {
+		52, 53,         /* 4th CQE */
+		36, 37,         /* 3rd CQE */
+		20, 21,         /* 2nd CQE */
+		 4,  5          /* 1st CQE */
+	};
+	/* Mask to extract 16B data from a 64B CQE. */
+	const uint8x16_t cqe_shuf_m = {
+		28, 29,         /* hdr_type_etc */
+		 0,             /* pkt_info */
+		-1,             /* null */
+		47, 46,         /* byte_cnt, bswap16 */
+		31, 30,         /* vlan_info, bswap16 */
+		15, 14, 13, 12, /* rx_hash_res, bswap32 */
+		57, 58, 59,     /* flow_tag */
+		63              /* op_own */
+	};
+	/* Mask to generate 16B data for mbuf. */
+	const uint8x16_t mb_shuf_m = {
+		 4,  5, -1, -1, /* pkt_len */
+		 4,  5,         /* data_len */
+		 6,  7,         /* vlan_tci */
+		 8,  9, 10, 11, /* hash.rss */
+		12, 13, 14, -1  /* hash.fdir.hi */
+	};
+	/* Mask to generate 16B owner vector. */
+	const uint8x8_t owner_shuf_m = {
+		63, -1,         /* 4th CQE */
+		47, -1,         /* 3rd CQE */
+		31, -1,         /* 2nd CQE */
+		15, -1          /* 1st CQE */
+	};
+	/* Mask to generate a vector having packet_type/ol_flags. */
+	const uint8x16_t ptype_shuf_m = {
+		48, 49, 50, -1, /* 4th CQE */
+		32, 33, 34, -1, /* 3rd CQE */
+		16, 17, 18, -1, /* 2nd CQE */
+		 0,  1,  2, -1  /* 1st CQE */
+	};
+	/* Mask to generate a vector having flow tags. */
+	const uint8x16_t ftag_shuf_m = {
+		60, 61, 62, -1, /* 4th CQE */
+		44, 45, 46, -1, /* 3rd CQE */
+		28, 29, 30, -1, /* 2nd CQE */
+		12, 13, 14, -1  /* 1st CQE */
+	};
+	const uint16x8_t crc_adj = {
+		0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0
+	};
+	const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
+
+	MLX5_ASSERT(rxq->sges_n == 0);
+	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+	if (rxq->consumed_strd == strd_n) {
+		/* Replace WQE only if the buffer is still in use. */
+		if (rte_atomic16_read(&buf->refcnt) > 1) {
+			mprq_buf_replace(rxq, rq_idx, strd_n);
+			/* Release the old buffer. */
+			mlx5_mprq_buf_free(buf);
+		} else if (unlikely(rxq->mprq_repl == NULL)) {
+			struct mlx5_mprq_buf *rep;
+
+			/*
+			 * Currently, the MPRQ mempool is out of buffer
+			 * and doing memcpy regardless of the size of Rx
+			 * packet. Retry allocation to get back to
+			 * normal.
+			 */
+			if (!rte_mempool_get(rxq->mprq_mp,
+					     (void **)&rep))
+				rxq->mprq_repl = rep;
+			}
+		/* Advance to the next WQE. */
+		rxq->consumed_strd = 0;
+		++rq_ci;
+		rq_idx = rq_ci & e_mask;
+		buf = (*rxq->mprq_bufs)[rq_idx];
+		rxq->rq_repl_thresh = 1;
+	}
+	if (rxq->rq_repl_thresh)
+		mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_idx);
+
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch_non_temporal(cq);
+	rte_prefetch_non_temporal(cq + 1);
+	rte_prefetch_non_temporal(cq + 2);
+	rte_prefetch_non_temporal(cq + 3);
+	elts_idx = (rq_ci & e_mask) * strd_n +
+		(rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+	elts = &(*rxq->elts)[elts_idx];
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	/* See if there're unreturned mbufs from compressed CQE. */
+	rcvd_pkt = rxq->decompressed;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+				     rq_ci, rxq->consumed_strd, true);
+		rxq->consumed_strd += rcvd_pkt;
+		pkts += rcvd_pkt;
+		rxq->decompressed -= rcvd_pkt;
+	}
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+	pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
+		return rcvd_pkt;
+	}
+	/* At this point, there shouldn't be any remained packets. */
+	MLX5_ASSERT(rxq->decompressed == 0);
+	/*
+	 * Note that vectors have reverse order - {v3, v2, v1, v0}, because
+	 * there's no instruction to count trailing zeros. __builtin_clzl() is
+	 * used instead.
+	 *
+	 * A. copy 4 mbuf pointers from elts ring to returing pkts.
+	 * B. load 64B CQE and extract necessary fields
+	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+	 *    following structure:
+	 *        struct {
+	 *          uint16_t hdr_type_etc;
+	 *          uint8_t  pkt_info;
+	 *          uint8_t  rsvd;
+	 *          uint16_t byte_cnt;
+	 *          uint16_t vlan_info;
+	 *          uint32_t rx_has_res;
+	 *          uint8_t  flow_tag[3];
+	 *          uint8_t  op_own;
+	 *        } c;
+	 * C. fill in mbuf.
+	 * D. get valid CQEs.
+	 * E. find compressed CQE.
+	 */
+	for (pos = 0;
+	     pos < pkts_n;
+	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
+		uint16x4_t op_own;
+		uint16x4_t opcode, owner_mask, invalid_mask;
+		uint16x4_t comp_mask;
+		uint16x4_t mask;
+		uint16x4_t byte_cnt;
+		uint32x4_t ptype_info, flow_tag;
+		register uint64x2_t c0, c1, c2, c3;
+		uint8_t *p0, *p1, *p2, *p3;
+		uint8_t *e0 = (void *)&elts[pos + rxq->consumed_strd]->pkt_len;
+		uint8_t *e1 = (void *)&elts[pos +
+					    rxq->consumed_strd + 1]->pkt_len;
+		uint8_t *e2 = (void *)&elts[pos +
+					    rxq->consumed_strd + 2]->pkt_len;
+		uint8_t *e3 = (void *)&elts[pos +
+					    rxq->consumed_strd + 3]->pkt_len;
+		void *elts_p = (void *)&elts[pos + rxq->consumed_strd];
+		void *pkts_p = (void *)&pkts[pos];
+
+		/* A.0 do not cross the end of CQ. */
+		mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+				   -1UL >> ((pkts_n - pos) *
+					    sizeof(uint16_t) * 8) : 0);
+		p0 = (void *)&cq[pos].pkt_info;
+		p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
+		p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
+		p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+		/* B.0 (CQE 3) load a block having op_own. */
+		c3 = vld1q_u64((uint64_t *)(p3 + 48));
+		/* B.0 (CQE 2) load a block having op_own. */
+		c2 = vld1q_u64((uint64_t *)(p2 + 48));
+		/* B.0 (CQE 1) load a block having op_own. */
+		c1 = vld1q_u64((uint64_t *)(p1 + 48));
+		/* B.0 (CQE 0) load a block having op_own. */
+		c0 = vld1q_u64((uint64_t *)(p0 + 48));
+		/* Synchronize for loading the rest of blocks. */
+		rte_cio_rmb();
+		/* Prefetch next 4 CQEs. */
+		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+			unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
+			rte_prefetch_non_temporal(&cq[next]);
+			rte_prefetch_non_temporal(&cq[next + 1]);
+			rte_prefetch_non_temporal(&cq[next + 2]);
+			rte_prefetch_non_temporal(&cq[next + 3]);
+		}
+		__asm__ volatile (
+		/* B.1 (CQE 3) load the rest of blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+		/* B.2 (CQE 3) move the block having op_own. */
+		"mov v19.16b, %[c3].16b \n\t"
+		/* B.3 (CQE 3) extract 16B fields. */
+		"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.1 (CQE 2) load the rest of blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+		/* B.4 (CQE 3) adjust CRC length. */
+		"sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
+		/* C.1 (CQE 3) generate final structure for mbuf. */
+		"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
+		/* B.2 (CQE 2) move the block having op_own. */
+		"mov v19.16b, %[c2].16b \n\t"
+		/* B.3 (CQE 2) extract 16B fields. */
+		"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.1 (CQE 1) load the rest of blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+		/* B.4 (CQE 2) adjust CRC length. */
+		"sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
+		/* C.1 (CQE 2) generate final structure for mbuf. */
+		"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
+		/* B.2 (CQE 1) move the block having op_own. */
+		"mov v19.16b, %[c1].16b \n\t"
+		/* B.3 (CQE 1) extract 16B fields. */
+		"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.1 (CQE 0) load the rest of blocks. */
+		"ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+		/* B.4 (CQE 1) adjust CRC length. */
+		"sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
+		/* C.1 (CQE 1) generate final structure for mbuf. */
+		"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
+		/* B.2 (CQE 0) move the block having op_own. */
+		"mov v19.16b, %[c0].16b \n\t"
+		/* A.1 load mbuf pointers. */
+		"ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
+		/* B.3 (CQE 0) extract 16B fields. */
+		"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+		/* B.4 (CQE 0) adjust CRC length. */
+		"sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
+		/* D.1 extract op_own byte. */
+		"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
+		/* C.2 (CQE 3) adjust flow mark. */
+		"add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v15.2d}, [%[e3]] \n\t"
+		/* C.2 (CQE 2) adjust flow mark. */
+		"add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v14.2d}, [%[e2]] \n\t"
+		/* C.1 (CQE 0) generate final structure for mbuf. */
+		"tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t"
+		/* C.2 (CQE 1) adjust flow mark. */
+		"add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v13.2d}, [%[e1]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Extract byte_cnt. */
+		"tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t"
+#endif
+		/* Extract ptype_info. */
+		"tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t"
+		/* Extract flow_tag. */
+		"tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t"
+		/* A.2 copy mbuf pointers. */
+		"st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t"
+		/* C.2 (CQE 0) adjust flow mark. */
+		"add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t"
+		/* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+		"st1 {v12.2d}, [%[e0]] \n\t"
+		:[op_own]"=&w"(op_own),
+		 [byte_cnt]"=&w"(byte_cnt),
+		 [ptype_info]"=&w"(ptype_info),
+		 [flow_tag]"=&w"(flow_tag)
+		:[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
+		 [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+		 [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
+		 [elts_p]"r"(elts_p),
+		 [pkts_p]"r"(pkts_p),
+		 [cqe_shuf_m]"w"(cqe_shuf_m),
+		 [mb_shuf_m]"w"(mb_shuf_m),
+		 [owner_shuf_m]"w"(owner_shuf_m),
+		 [len_shuf_m]"w"(len_shuf_m),
+		 [ptype_shuf_m]"w"(ptype_shuf_m),
+		 [ftag_shuf_m]"w"(ftag_shuf_m),
+		 [crc_adj]"w"(crc_adj),
+		 [flow_mark_adj]"w"(flow_mark_adj)
+		:"memory",
+		 "v12", "v13", "v14", "v15",
+		 "v16", "v17", "v18", "v19",
+		 "v20", "v21", "v22", "v23",
+		 "v24", "v25");
+		/* D.2 flip owner bit to mark CQEs from last round. */
+		owner_mask = vand_u16(op_own, owner_check);
+		owner_mask = vceq_u16(owner_mask, ownership);
+		/* D.3 get mask for invalidated CQEs. */
+		opcode = vand_u16(op_own, opcode_check);
+		invalid_mask = vceq_u16(opcode_check, opcode);
+		/* E.1 find compressed CQE format. */
+		comp_mask = vand_u16(op_own, format_check);
+		comp_mask = vceq_u16(comp_mask, format_check);
+		/* D.4 mask out beyond boundary. */
+		invalid_mask = vorr_u16(invalid_mask, mask);
+		/* D.5 merge invalid_mask with invalid owner. */
+		invalid_mask = vorr_u16(invalid_mask, owner_mask);
+		/* E.2 mask out invalid entries. */
+		comp_mask = vbic_u16(comp_mask, invalid_mask);
+		/* E.3 get the first compressed CQE. */
+		comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+					  comp_mask), 0)) /
+					  (sizeof(uint16_t) * 8);
+		/* D.6 mask out entries after the compressed CQE. */
+		mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?
+				   -1UL >> (comp_idx * sizeof(uint16_t) * 8) :
+				   0);
+		invalid_mask = vorr_u16(invalid_mask, mask);
+		/* D.7 count non-compressed valid CQEs. */
+		n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+				   invalid_mask), 0)) / (sizeof(uint16_t) * 8);
+		nocmp_n += n;
+		/* D.2 get the final invalid mask. */
+		mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?
+				   -1UL >> (n * sizeof(uint16_t) * 8) : 0);
+		invalid_mask = vorr_u16(invalid_mask, mask);
+		/* D.3 check error in opcode. */
+		opcode = vceq_u16(resp_err_check, opcode);
+		opcode = vbic_u16(opcode, invalid_mask);
+		/* D.4 mark if any error is set */
+		*err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+		/* C.4 fill in mbuf - rearm_data and packet_type. */
+		rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
+					 opcode, &elts[pos]);
+		if (rxq->hw_timestamp) {
+			elts[pos]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p0, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+			elts[pos + 1]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p1, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+			elts[pos + 2]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p2, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+			elts[pos + 3]->timestamp =
+				rte_be_to_cpu_64(
+					container_of(p3, struct mlx5_cqe,
+						     pkt_info)->timestamp);
+		}
+		if (!!rxq->flow_meta_mask) {
+			/* This code is subject for futher optimization. */
+			int32_t offs = rxq->flow_meta_offset;
+
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				container_of(p0, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				container_of(p1, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				container_of(p2, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				container_of(p3, struct mlx5_cqe,
+					     pkt_info)->flow_table_metadata;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+				elts[pos]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+				elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+				elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+				elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+		}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Add up received bytes count. */
+		byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+		/*
+		 * Break the loop unless more valid CQE is expected, or if
+		 * there's a compressed CQE.
+		 */
+		if (n != MLX5_VPMD_DESCS_PER_LOOP)
+			break;
+	}
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
+		return rcvd_pkt;
+	}
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+			     rq_ci, rxq->consumed_strd, false);
+	rxq->cq_ci += nocmp_n;
+	rxq->consumed_strd += nocmp_n;
+	rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += nocmp_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+					     rq_ci, rxq->consumed_strd, true);
+			rxq->consumed_strd += n;
+			rcvd_pkt += n;
+			rxq->decompressed -= n;
+		}
+	}
+	rte_cio_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	if (rq_ci != rxq->rq_ci) {
+		rxq->rq_ci = rq_ci;
+		rte_cio_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	}
+	*no_cq = !rcvd_pkt;
+	return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 34e3397115..4054614674 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -56,6 +56,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
 		pkts[pos] = elts[pos];
 }
 
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ * @param buf
+ *   MPRQ buffer to get packets from.
+ * @param buf rq_ci
+ *   WQE index.
+ * @param strd_idx
+ *   Stride number.
+ * @param comp
+ *   Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		     uint16_t n, struct mlx5_mprq_buf *buf,
+		     uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+	const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	const unsigned int strd_shift =
+		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+	uint32_t offset;
+	void *addr;
+	int i = 0;
+
+	if (comp) {
+		const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+		struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+		unsigned int pos;
+		uint16_t p = n & -2;
+
+		for (pos = 0; pos < p; pos += 2) {
+			__m128i mbp;
+
+			mbp = _mm_loadu_si128((__m128i *)&elts[pos +
+							rxq->consumed_strd]);
+			_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
+		}
+		if (n & 1)
+			pkts[pos] = elts[pos];
+	}
+
+	for (i = 0; i < n; ++i) {
+		offset = (strd_idx + i) * strd_sz + strd_shift;
+		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+		if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+		    rxq->mprq_repl == NULL) {
+			rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+				   addr, pkts[i]->pkt_len);
+		} else {
+			rte_iova_t buf_iova;
+			struct rte_mbuf_ext_shared_info *shinfo;
+			uint16_t buf_len = strd_sz;
+			void *buf_addr;
+			/* Increment the refcnt of the whole chunk. */
+			rte_atomic16_add_return(&buf->refcnt, 1);
+			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+				    strd_n + 1);
+			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+			/*
+			 * MLX5 device doesn't use iova but it is necessary in a
+			 * case where the Rx packet is transmitted via a
+			 * different PMD.
+			 */
+			buf_iova = rte_mempool_virt2iova(buf) +
+				RTE_PTR_DIFF(buf_addr, buf);
+			shinfo = &buf->shinfos[strd_idx];
+			rte_mbuf_ext_refcnt_set(shinfo, 1);
+			/*
+			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+			 * attaching the stride to mbuf and more offload flags
+			 * will be added below by calling rxq_cq_to_mbuf().
+			 * Other fields will be overwritten.
+			 */
+			rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+						  buf_len, shinfo);
+			/* Set mbuf head-room. */
+			SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+			DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+		}
+	}
+}
+
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -753,4 +842,435 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	return rcvd_pkt;
 }
 
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+		 const unsigned int strd_n)
+{
+	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+	volatile struct mlx5_wqe_data_seg *wqe =
+		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+	void *addr;
+
+	MLX5_ASSERT(rep != NULL);
+	/* Replace MPRQ buf. */
+	(*rxq->mprq_bufs)[rq_idx] = rep;
+	/* Replace WQE. */
+	addr = mlx5_mprq_buf_addr(rep, strd_n);
+	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+	/* If there's only one MR, no need to replace LKey in WQE. */
+	if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+	/* Stash a mbuf for next replacement. */
+	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+		rxq->mprq_repl = rep;
+	else
+		rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+	const unsigned int strd_n = 1 << rxq->strd_num_n;
+	const uint16_t q_n = 1 << rxq->cqe_n;
+	const uint16_t q_mask = q_n - 1;
+	const uint16_t e_n = 1 << rxq->elts_n;
+	const uint16_t e_mask = e_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	unsigned int pos;
+	uint64_t n;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	unsigned int cq_ci = rxq->cq_ci;
+	unsigned int cq_idx = cq_ci & q_mask;
+	unsigned int rq_ci = rxq->rq_ci;
+	unsigned int rq_idx = rq_ci & e_mask;
+	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+	unsigned int elts_idx;
+	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+	const __m128i owner_check =
+		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
+	const __m128i opcode_check =
+		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
+	const __m128i format_check =
+		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
+	const __m128i resp_err_check =
+		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	uint32_t rcvd_byte = 0;
+	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+	const __m128i len_shuf_mask =
+		_mm_set_epi8(-1, -1, -1, -1,
+			     -1, -1, -1, -1,
+			     12, 13,  8,  9,
+			      4,  5,  0,  1);
+#endif
+	/* Mask to shuffle from extracted CQE to mbuf. */
+	const __m128i shuf_mask =
+		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
+			     12, 13, 14, 15, /* rss, bswap32 */
+			     10, 11,         /* vlan_tci, bswap16 */
+			      4,  5,         /* data_len, bswap16 */
+			     -1, -1,         /* zero out 2nd half of pkt_len */
+			      4,  5          /* pkt_len, bswap16 */);
+	/* Mask to blend from the last Qword to the first DQword. */
+	const __m128i blend_mask =
+		_mm_set_epi8(-1, -1, -1, -1,
+			     -1, -1, -1, -1,
+			      0,  0,  0,  0,
+			      0,  0,  0, -1);
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
+	const __m128i crc_adj =
+		_mm_set_epi16(0, 0, 0, 0, 0,
+			      rxq->crc_present * RTE_ETHER_CRC_LEN,
+			      0,
+			      rxq->crc_present * RTE_ETHER_CRC_LEN);
+	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
+
+	MLX5_ASSERT(rxq->sges_n == 0);
+	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+
+	if (rxq->consumed_strd == strd_n) {
+		/* Replace WQE only if the buffer is still in use. */
+		if (rte_atomic16_read(&buf->refcnt) > 1) {
+			mprq_buf_replace(rxq, rq_ci & e_mask, strd_n);
+			/* Release the old buffer. */
+			mlx5_mprq_buf_free(buf);
+		} else if (unlikely(rxq->mprq_repl == NULL)) {
+			struct mlx5_mprq_buf *rep;
+
+			/*
+			 * Currently, the MPRQ mempool is out of buffer
+			 * and doing memcpy regardless of the size of Rx
+			 * packet. Retry allocation to get back to
+			 * normal.
+			 */
+			if (!rte_mempool_get(rxq->mprq_mp,
+					     (void **)&rep))
+				rxq->mprq_repl = rep;
+		}
+		/* Advance to the next WQE. */
+		rxq->consumed_strd = 0;
+		++rq_ci;
+		buf = (*rxq->mprq_bufs)[rq_ci & e_mask];
+		rxq->rq_repl_thresh = 1;
+	}
+	if (rxq->rq_repl_thresh)
+		mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);
+
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch0(cq);
+	rte_prefetch0(cq + 1);
+	rte_prefetch0(cq + 2);
+	rte_prefetch0(cq + 3);
+	elts_idx = (rq_ci & e_mask) * strd_n +
+		(rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+	elts = &(*rxq->elts)[elts_idx];
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	/* See if there're unreturned mbufs from compressed CQE. */
+	rcvd_pkt = rxq->decompressed;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+				     rq_ci, rxq->consumed_strd, true);
+		rxq->consumed_strd += rcvd_pkt;
+		rxq->decompressed -= rcvd_pkt;
+		pkts += rcvd_pkt;
+	}
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+	pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
+		return rcvd_pkt;
+	}
+	/* At this point, there shouldn't be any remained packets. */
+	MLX5_ASSERT(rxq->decompressed == 0);
+	/*
+	 * A. load first Qword (8bytes) in one loop.
+	 * B. copy 4 mbuf pointers from elts ring to returing pkts.
+	 * C. load remained CQE data and extract necessary fields.
+	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+	 *    following structure:
+	 *        struct {
+	 *          uint8_t  pkt_info;
+	 *          uint8_t  flow_tag[3];
+	 *          uint16_t byte_cnt;
+	 *          uint8_t  rsvd4;
+	 *          uint8_t  op_own;
+	 *          uint16_t hdr_type_etc;
+	 *          uint16_t vlan_info;
+	 *          uint32_t rx_has_res;
+	 *        } c;
+	 * D. fill in mbuf.
+	 * E. get valid CQEs.
+	 * F. find compressed CQE.
+	 */
+	for (pos = 0;
+	     pos < pkts_n;
+	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
+		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
+		__m128i cqe_tmp1, cqe_tmp2;
+		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+		__m128i op_own, op_own_tmp1, op_own_tmp2;
+		__m128i opcode, owner_mask, invalid_mask;
+		__m128i comp_mask;
+		__m128i mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		__m128i byte_cnt;
+#endif
+		__m128i mbp1, mbp2;
+		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
+		unsigned int p1, p2, p3;
+
+		/* Prefetch next 4 CQEs. */
+		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+		}
+		/* A.0 do not cross the end of CQ. */
+		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
+		mask = _mm_sll_epi64(ones, mask);
+		p = _mm_andnot_si128(mask, p);
+		/* A.1 load cqes. */
+		p3 = _mm_extract_epi16(p, 3);
+		cqes[3] = _mm_loadl_epi64((__m128i *)
+					   &cq[pos + p3].sop_drop_qpn);
+		rte_compiler_barrier();
+		p2 = _mm_extract_epi16(p, 2);
+		cqes[2] = _mm_loadl_epi64((__m128i *)
+					   &cq[pos + p2].sop_drop_qpn);
+		rte_compiler_barrier();
+		/* B.1 load mbuf pointers. */
+		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos +
+						rxq->consumed_strd]);
+		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos +
+						rxq->consumed_strd + 2]);
+		/* A.1 load a block having op_own. */
+		p1 = _mm_extract_epi16(p, 1);
+		cqes[1] = _mm_loadl_epi64((__m128i *)
+					   &cq[pos + p1].sop_drop_qpn);
+		rte_compiler_barrier();
+		cqes[0] = _mm_loadl_epi64((__m128i *)
+					   &cq[pos].sop_drop_qpn);
+		/* B.2 copy mbuf pointers. */
+		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
+		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
+		rte_cio_rmb();
+		/* C.1 load remained CQE data and extract necessary fields. */
+		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
+		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
+		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
+		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
+		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
+		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
+		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
+		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
+		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
+		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
+		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
+		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
+		/* C.2 generate final structure for mbuf with swapping bytes. */
+		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
+		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
+		/* C.3 adjust CRC length. */
+		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
+		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
+		/* C.4 adjust flow mark. */
+		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
+		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
+		/* D.1 fill in mbuf - rx_descriptor_fields1. */
+		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
+		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
+		/* E.1 extract op_own field. */
+		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
+		/* C.1 load remained CQE data and extract necessary fields. */
+		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
+		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
+		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
+		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
+		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
+		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
+		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
+		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
+		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
+		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
+		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
+		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
+		/* C.2 generate final structure for mbuf with swapping bytes. */
+		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
+		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
+		/* C.3 adjust CRC length. */
+		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
+		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
+		/* C.4 adjust flow mark. */
+		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
+		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
+		/* E.1 extract op_own byte. */
+		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
+		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
+		/* D.1 fill in mbuf - rx_descriptor_fields1. */
+		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
+		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
+		/* E.2 flip owner bit to mark CQEs from last round. */
+		owner_mask = _mm_and_si128(op_own, owner_check);
+		if (ownership)
+			owner_mask = _mm_xor_si128(owner_mask, owner_check);
+		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
+		owner_mask = _mm_packs_epi32(owner_mask, zero);
+		/* E.3 get mask for invalidated CQEs. */
+		opcode = _mm_and_si128(op_own, opcode_check);
+		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
+		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
+		/* E.4 mask out beyond boundary. */
+		invalid_mask = _mm_or_si128(invalid_mask, mask);
+		/* E.5 merge invalid_mask with invalid owner. */
+		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
+		/* F.1 find compressed CQE format. */
+		comp_mask = _mm_and_si128(op_own, format_check);
+		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
+		comp_mask = _mm_packs_epi32(comp_mask, zero);
+		/* F.2 mask out invalid entries. */
+		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
+		comp_idx = _mm_cvtsi128_si64(comp_mask);
+		/* F.3 get the first compressed CQE. */
+		comp_idx = comp_idx ?
+				__builtin_ctzll(comp_idx) /
+					(sizeof(uint16_t) * 8) :
+				MLX5_VPMD_DESCS_PER_LOOP;
+		/* E.6 mask out entries after the compressed CQE. */
+		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
+		mask = _mm_sll_epi64(ones, mask);
+		invalid_mask = _mm_or_si128(invalid_mask, mask);
+		/* E.7 count non-compressed valid CQEs. */
+		n = _mm_cvtsi128_si64(invalid_mask);
+		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+			MLX5_VPMD_DESCS_PER_LOOP;
+		nocmp_n += n;
+		/* D.2 get the final invalid mask. */
+		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
+		mask = _mm_sll_epi64(ones, mask);
+		invalid_mask = _mm_or_si128(invalid_mask, mask);
+		/* D.3 check error in opcode. */
+		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
+		opcode = _mm_packs_epi32(opcode, zero);
+		opcode = _mm_andnot_si128(invalid_mask, opcode);
+		/* D.4 mark if any error is set */
+		*err |= _mm_cvtsi128_si64(opcode);
+		/* D.5 fill in mbuf - rearm_data and packet_type. */
+		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+		if (rxq->hw_timestamp) {
+			pkts[pos]->timestamp =
+				rte_be_to_cpu_64(cq[pos].timestamp);
+			pkts[pos + 1]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p1].timestamp);
+			pkts[pos + 2]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p2].timestamp);
+			pkts[pos + 3]->timestamp =
+				rte_be_to_cpu_64(cq[pos + p3].timestamp);
+		}
+		if (rxq->dynf_meta) {
+			/* This code is subject for futher optimization. */
+			int32_t offs = rxq->flow_meta_offset;
+
+			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+				cq[pos].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+				cq[pos + p1].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+				cq[pos + p2].flow_table_metadata;
+			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+				cq[pos + p3].flow_table_metadata;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+		}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Add up received bytes count. */
+		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
+		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+		/*
+		 * Break the loop unless more valid CQE is expected, or if
+		 * there's a compressed CQE.
+		 */
+		if (n != MLX5_VPMD_DESCS_PER_LOOP)
+			break;
+	}
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
+		return rcvd_pkt;
+	}
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+			     rq_ci, rxq->consumed_strd, false);
+	rxq->cq_ci += nocmp_n;
+	rxq->consumed_strd += nocmp_n;
+	rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	rxq->stats.ipackets += nocmp_n;
+	rxq->stats.ibytes += rcvd_byte;
+#endif
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+					&elts[nocmp_n + rxq->consumed_strd]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+					     rq_ci, rxq->consumed_strd, true);
+			rxq->consumed_strd += n;
+			rcvd_pkt += n;
+			rxq->decompressed -= n;
+		}
+	}
+
+	rte_compiler_barrier();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	if (rq_ci != rxq->rq_ci) {
+		rxq->rq_ci = rq_ci;
+		rte_cio_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	}
+	*no_cq = !rcvd_pkt;
+	return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq
  2020-07-19  4:11 [dpdk-dev] [PATCH] net/mlx5: implement vectorized MPRQ burst Alexander Kozyrev
@ 2020-10-21 20:30 ` Alexander Kozyrev
  2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: refactor vectorized Rx routine Alexander Kozyrev
                     ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Alexander Kozyrev @ 2020-10-21 20:30 UTC (permalink / raw)
  To: dev; +Cc: rasland, matan, viacheslavo

The vectorized Rx burst function helps to accelerate the Rx processing
by using SIMD (single instruction, multiple data) extensions for the
multi-buffer packet processing. Pre-allocating multiple mbufs and
filling them in batches of four greatly improves the throughput of the
Rx burst routine.

MPRQ (Multi-Packet Rx Queue) lacks the vectorized version currently.
It works by posting a single large buffer (consisted of  multiple
fixed-size strides) in order to receive multiple packets at once on this
buffer. A Rx packet is then copied to a user-provided mbuf or PMD
attaches the Rx packet to the mbuf by the pointer to an external buffer.

It is proposed to add a vectorized MPRQ Rx routine to speed up the MPRQ
buffer handling as well. It would require pre-allocation of multiple
mbufs every time we exhaust all the strides from the current MPRQ buffer
and switch to a new one. The new mlx5_rx_burst_mprq_vec() routine will
take care of this as well as of decision on whether should we copy or
attach an external buffer for a packet. The batch processing logic won't
be different from the simple vectorized Rx routine.

The new vectorized MPRQ burst function is going to be selected
automatically whenever the mprq_en devarg is specified. If SIMD is not
available on the platform we fall back to the simple MPRQ Rx burst
function. LRO is not supported by the vectorized MPRQ version and fall
back to the regular MPRQ will be performed.


Alexander Kozyrev (2):
  net/mlx5: refactor vectorized Rx routine
  net/mlx5: implement vectorized MPRQ burst

 drivers/net/mlx5/mlx5_devx.c             |  15 +-
 drivers/net/mlx5/mlx5_ethdev.c           |  20 +-
 drivers/net/mlx5/mlx5_rxq.c              |  96 +++---
 drivers/net/mlx5/mlx5_rxtx.c             | 237 ++++---------
 drivers/net/mlx5/mlx5_rxtx.h             | 200 ++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec.c         | 416 ++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec.h         |  55 ---
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 106 ++----
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 103 ++----
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 121 ++-----
 10 files changed, 813 insertions(+), 556 deletions(-)

-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] net/mlx5: refactor vectorized Rx routine
  2020-10-21 20:30 ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Alexander Kozyrev
@ 2020-10-21 20:30   ` Alexander Kozyrev
  2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: implement vectorized MPRQ burst Alexander Kozyrev
  2020-10-22 15:01   ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Raslan Darawsheh
  2 siblings, 0 replies; 5+ messages in thread
From: Alexander Kozyrev @ 2020-10-21 20:30 UTC (permalink / raw)
  To: dev; +Cc: rasland, matan, viacheslavo

Move the main processing cycle into a separate function:
rxq_cq_process_v. Put the regular rxq_burst_v function
to a non-arch specific file. Having all SIMD instructions
in a single reusable block is a first preparatory step to
implement vectorized Rx burst for MPRQ feature.

Pass a pointer to the storage of mbufs directly to the
rxq_copy_mbuf_v instead of calculating the pointer inside
this function. This is needed for the future vectorized Rx
routing which is going to pass a different pointer here.

Calculate the number of packets to replenish inside the
mlx5_rx_replenish_bulk_mbuf. Containing this logic in one
place allows us to do the same for MPRQ case.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Slava Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_rxtx_vec.c         | 104 +++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec.h         |  69 ++++++-------
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 106 ++++----------------
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 103 ++++---------------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 121 +++++------------------
 5 files changed, 204 insertions(+), 299 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index f083038682..aa48775738 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -77,6 +77,110 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	return n;
 }
 
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+	    uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+	const uint16_t q_n = 1 << rxq->cqe_n;
+	const uint16_t q_mask = q_n - 1;
+	const uint16_t e_n = 1 << rxq->elts_n;
+	const uint16_t e_mask = e_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	unsigned int cq_idx = rxq->cq_ci & q_mask;
+	unsigned int elts_idx;
+
+	MLX5_ASSERT(rxq->sges_n == 0);
+	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch0(cq);
+	rte_prefetch0(cq + 1);
+	rte_prefetch0(cq + 2);
+	rte_prefetch0(cq + 3);
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	mlx5_rx_replenish_bulk_mbuf(rxq);
+	/* See if there're unreturned mbufs from compressed CQE. */
+	rcvd_pkt = rxq->decompressed;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		rxq_copy_mbuf_v(&(*rxq->elts)[rxq->rq_pi & e_mask],
+				pkts, rcvd_pkt);
+		rxq->rq_pi += rcvd_pkt;
+		rxq->decompressed -= rcvd_pkt;
+		pkts += rcvd_pkt;
+	}
+	elts_idx = rxq->rq_pi & e_mask;
+	elts = &(*rxq->elts)[elts_idx];
+	/* Not to overflow pkts array. */
+	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
+		return rcvd_pkt;
+	}
+	/* At this point, there shouldn't be any remaining packets. */
+	MLX5_ASSERT(rxq->decompressed == 0);
+	/* Process all the CQEs */
+	nocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, err, &comp_idx);
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
+		return rcvd_pkt;
+	}
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	rxq->cq_ci += nocmp_n;
+	rxq->rq_pi += nocmp_n;
+	rcvd_pkt += nocmp_n;
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
+		rxq->cq_ci += rxq->decompressed;
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			rxq_copy_mbuf_v(&(*rxq->elts)[rxq->rq_pi & e_mask],
+					&pkts[nocmp_n], n);
+			rxq->rq_pi += n;
+			rcvd_pkt += n;
+			rxq->decompressed -= n;
+		}
+	}
+	rte_io_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	*no_cq = !rcvd_pkt;
+	return rcvd_pkt;
+}
+
 /**
  * DPDK callback for vectorized RX.
  *
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index a8d6c4f411..ce27074b08 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -73,53 +73,54 @@ S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) ==
  *
  * @param rxq
  *   Pointer to RX queue structure.
- * @param n
- *   Number of buffers to be replenished.
  */
 static inline void
-mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
+mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
 {
 	const uint16_t q_n = 1 << rxq->elts_n;
 	const uint16_t q_mask = q_n - 1;
+	uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);
 	uint16_t elts_idx = rxq->rq_ci & q_mask;
 	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
 	volatile struct mlx5_wqe_data_seg *wq =
 		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
 	unsigned int i;
 
-	MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
-	MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
-	MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
-		    MLX5_VPMD_DESCS_PER_LOOP);
-	/* Not to cross queue end. */
-	n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
-	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
-		rxq->stats.rx_nombuf += n;
-		return;
-	}
-	for (i = 0; i < n; ++i) {
-		void *buf_addr;
+	if (n >= rxq->rq_repl_thresh) {
+		MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
+		MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
+			    MLX5_VPMD_DESCS_PER_LOOP);
+		/* Not to cross queue end. */
+		n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
+		if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+			rxq->stats.rx_nombuf += n;
+			return;
+		}
+		for (i = 0; i < n; ++i) {
+			void *buf_addr;
 
-		/*
-		 * In order to support the mbufs with external attached
-		 * data buffer we should use the buf_addr pointer instead of
-		 * rte_mbuf_buf_addr(). It touches the mbuf itself and may
-		 * impact the performance.
-		 */
-		buf_addr = elts[i]->buf_addr;
-		wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
-					      RTE_PKTMBUF_HEADROOM);
-		/* If there's only one MR, no need to replace LKey in WQE. */
-		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-			wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+			/*
+			 * In order to support the mbufs with external attached
+			 * data buffer we should use the buf_addr pointer
+			 * instead of rte_mbuf_buf_addr(). It touches the mbuf
+			 * itself and may impact the performance.
+			 */
+			buf_addr = elts[i]->buf_addr;
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+						      RTE_PKTMBUF_HEADROOM);
+			/* If there's a single MR, no need to replace LKey. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)
+				     > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+		}
+		rxq->rq_ci += n;
+		/* Prevent overflowing into consumed mbufs. */
+		elts_idx = rxq->rq_ci & q_mask;
+		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+			(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
+		rte_io_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 	}
-	rxq->rq_ci += n;
-	/* Prevent overflowing into consumed mbufs. */
-	elts_idx = rxq->rq_ci & q_mask;
-	for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-		(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
-	rte_io_wmb();
-	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index 6bf0c9b540..cf3a795843 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -33,18 +33,16 @@
 /**
  * Store free buffers to RX SW ring.
  *
- * @param rxq
- *   Pointer to RX queue structure.
+ * @param elts
+ *   Pointer to SW ring to be filled.
  * @param pkts
  *   Pointer to array of packets to be stored.
  * @param pkts_n
  *   Number of packets to be stored.
  */
 static inline void
-rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
 {
-	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
-	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
 	unsigned int pos;
 	uint16_t p = n & -2;
 
@@ -550,14 +548,17 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
 		(vector unsigned char *)&pkts[3]->rearm_data);
 }
 
-
 /**
- * Receive burst of packets. An errored completion also consumes a mbuf, but the
- * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
- * before returning to application.
+ * Process a non-compressed completion and fill in mbufs in RX SW ring
+ * with data extracted from the title completion descriptor.
  *
  * @param rxq
  *   Pointer to RX queue structure.
+ * @param cq
+ *   Pointer to completion array having a non-compressed completion at first.
+ * @param elts
+ *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
+ *   the title completion descriptor to be copied to the rest of mbufs.
  * @param[out] pkts
  *   Array to store received packets.
  * @param pkts_n
@@ -565,28 +566,23 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
  * @param[out] err
  *   Pointer to a flag. Set non-zero value if pkts array has at least one error
  *   packet to handle.
- * @param[out] no_cq
- *  Pointer to a boolean. Set true if no new CQE seen.
+ * @param[out] comp
+ *   Pointer to a index. Set it to the first compressed completion if any.
  *
  * @return
- *   Number of packets received including errors (<= pkts_n).
+ *   Number of CQEs successfully processed.
  */
 static inline uint16_t
-rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint64_t *err, bool *no_cq)
+rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
-	volatile struct mlx5_cqe *cq;
-	struct rte_mbuf **elts;
 	unsigned int pos;
-	uint64_t n;
-	uint16_t repl_n;
+	uint64_t n = 0;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
-	uint16_t rcvd_pkt = 0;
-	unsigned int cq_idx = rxq->cq_ci & q_mask;
-	unsigned int elts_idx;
 	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
 	const vector unsigned char zero = (vector unsigned char){0};
 	const vector unsigned char ones = vec_splat_u8(-1);
@@ -638,41 +634,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	const vector unsigned short cqe_sel_mask2 =
 		(vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
 
-	MLX5_ASSERT(rxq->sges_n == 0);
-	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
-	cq = &(*rxq->cqes)[cq_idx];
-	rte_prefetch0(cq);
-	rte_prefetch0(cq + 1);
-	rte_prefetch0(cq + 2);
-	rte_prefetch0(cq + 3);
-	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
-
-	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	if (repl_n >= rxq->rq_repl_thresh)
-		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
-	/* See if there're unreturned mbufs from compressed CQE. */
-	rcvd_pkt = rxq->decompressed;
-	if (rcvd_pkt > 0) {
-		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
-		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
-		rxq->rq_pi += rcvd_pkt;
-		rxq->decompressed -= rcvd_pkt;
-		pkts += rcvd_pkt;
-	}
-	elts_idx = rxq->rq_pi & q_mask;
-	elts = &(*rxq->elts)[elts_idx];
-	/* Not to overflow pkts array. */
-	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
-	/* Not to cross queue end. */
-	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
-	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
-	if (!pkts_n) {
-		*no_cq = !rcvd_pkt;
-		return rcvd_pkt;
-	}
-	/* At this point, there shouldn't be any remaining packets. */
-	MLX5_ASSERT(rxq->decompressed == 0);
-
 	/*
 	 * A. load first Qword (8bytes) in one loop.
 	 * B. copy 4 mbuf pointers from elts ring to returing pkts.
@@ -1101,40 +1062,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 		if (n != MLX5_VPMD_DESCS_PER_LOOP)
 			break;
 	}
-	/* If no new CQE seen, return without updating cq_db. */
-	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
-		*no_cq = true;
-		return rcvd_pkt;
-	}
-	/* Update the consumer indexes for non-compressed CQEs. */
-	MLX5_ASSERT(nocmp_n <= pkts_n);
-	rxq->cq_ci += nocmp_n;
-	rxq->rq_pi += nocmp_n;
-	rcvd_pkt += nocmp_n;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	rxq->stats.ipackets += nocmp_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	/* Decompress the last CQE if compressed. */
-	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
-		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
-		rxq->decompressed =
-			rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
-		/* Return more packets if needed. */
-		if (nocmp_n < pkts_n) {
-			uint16_t n = rxq->decompressed;
-
-			n = RTE_MIN(n, pkts_n - nocmp_n);
-			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
-			rxq->rq_pi += n;
-			rcvd_pkt += n;
-			rxq->decompressed -= n;
-		}
-	}
-	rte_compiler_barrier();
-	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
-	*no_cq = !rcvd_pkt;
-	return rcvd_pkt;
+	if (comp_idx == n)
+		*comp = comp_idx;
+	return nocmp_n;
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index d122dad4fe..47b6692942 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -29,18 +29,16 @@
 /**
  * Store free buffers to RX SW ring.
  *
- * @param rxq
- *   Pointer to RX queue structure.
+ * @param elts
+ *   Pointer to SW ring to be filled.
  * @param pkts
  *   Pointer to array of packets to be stored.
  * @param pkts_n
  *   Number of packets to be stored.
  */
 static inline void
-rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
 {
-	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
-	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
 	unsigned int pos;
 	uint16_t p = n & -2;
 
@@ -368,12 +366,16 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
 }
 
 /**
- * Receive burst of packets. An errored completion also consumes a mbuf, but the
- * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
- * before returning to application.
+ * Process a non-compressed completion and fill in mbufs in RX SW ring
+ * with data extracted from the title completion descriptor.
  *
  * @param rxq
  *   Pointer to RX queue structure.
+ * @param cq
+ *   Pointer to completion array having a non-compressed completion at first.
+ * @param elts
+ *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
+ *   the title completion descriptor to be copied to the rest of mbufs.
  * @param[out] pkts
  *   Array to store received packets.
  * @param pkts_n
@@ -381,28 +383,23 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
  * @param[out] err
  *   Pointer to a flag. Set non-zero value if pkts array has at least one error
  *   packet to handle.
- * @param[out] no_cq
- *   Pointer to a boolean. Set true if no new CQE seen.
+ * @param[out] comp
+ *   Pointer to a index. Set it to the first compressed completion if any.
  *
  * @return
- *   Number of packets received including errors (<= pkts_n).
+ *   Number of CQEs successfully processed.
  */
 static inline uint16_t
-rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint64_t *err, bool *no_cq)
+rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
-	volatile struct mlx5_cqe *cq;
-	struct rte_mbuf **elts;
 	unsigned int pos;
-	uint64_t n;
-	uint16_t repl_n;
+	uint64_t n = 0;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
-	uint16_t rcvd_pkt = 0;
-	unsigned int cq_idx = rxq->cq_ci & q_mask;
-	unsigned int elts_idx;
 	const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
 	const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
 	const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
@@ -463,39 +460,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	};
 	const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
 
-	MLX5_ASSERT(rxq->sges_n == 0);
-	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
-	cq = &(*rxq->cqes)[cq_idx];
-	rte_prefetch_non_temporal(cq);
-	rte_prefetch_non_temporal(cq + 1);
-	rte_prefetch_non_temporal(cq + 2);
-	rte_prefetch_non_temporal(cq + 3);
-	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
-	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	if (repl_n >= rxq->rq_repl_thresh)
-		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
-	/* See if there're unreturned mbufs from compressed CQE. */
-	rcvd_pkt = rxq->decompressed;
-	if (rcvd_pkt > 0) {
-		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
-		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
-		rxq->rq_pi += rcvd_pkt;
-		pkts += rcvd_pkt;
-		rxq->decompressed -= rcvd_pkt;
-	}
-	elts_idx = rxq->rq_pi & q_mask;
-	elts = &(*rxq->elts)[elts_idx];
-	/* Not to overflow pkts array. */
-	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
-	/* Not to cross queue end. */
-	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
-	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
-	if (!pkts_n) {
-		*no_cq = !rcvd_pkt;
-		return rcvd_pkt;
-	}
-	/* At this point, there shouldn't be any remained packets. */
-	MLX5_ASSERT(rxq->decompressed == 0);
 	/*
 	 * Note that vectors have reverse order - {v3, v2, v1, v0}, because
 	 * there's no instruction to count trailing zeros. __builtin_clzl() is
@@ -773,40 +737,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 		if (n != MLX5_VPMD_DESCS_PER_LOOP)
 			break;
 	}
-	/* If no new CQE seen, return without updating cq_db. */
-	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
-		*no_cq = true;
-		return rcvd_pkt;
-	}
-	/* Update the consumer indexes for non-compressed CQEs. */
-	MLX5_ASSERT(nocmp_n <= pkts_n);
-	rxq->cq_ci += nocmp_n;
-	rxq->rq_pi += nocmp_n;
-	rcvd_pkt += nocmp_n;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	rxq->stats.ipackets += nocmp_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	/* Decompress the last CQE if compressed. */
-	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
-		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
-		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
-							&elts[nocmp_n]);
-		/* Return more packets if needed. */
-		if (nocmp_n < pkts_n) {
-			uint16_t n = rxq->decompressed;
-
-			n = RTE_MIN(n, pkts_n - nocmp_n);
-			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
-			rxq->rq_pi += n;
-			rcvd_pkt += n;
-			rxq->decompressed -= n;
-		}
-	}
-	rte_io_wmb();
-	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
-	*no_cq = !rcvd_pkt;
-	return rcvd_pkt;
+	if (comp_idx == n)
+		*comp = comp_idx;
+	return nocmp_n;
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 0bbcbeefff..59662fa12d 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -31,18 +31,16 @@
 /**
  * Store free buffers to RX SW ring.
  *
- * @param rxq
- *   Pointer to RX queue structure.
+ * @param elts
+ *   Pointer to SW ring to be filled.
  * @param pkts
  *   Pointer to array of packets to be stored.
  * @param pkts_n
  *   Number of packets to be stored.
  */
 static inline void
-rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
+rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
 {
-	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
-	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
 	unsigned int pos;
 	uint16_t p = n & -2;
 
@@ -227,7 +225,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	rxq->stats.ipackets += mcqe_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	rxq->cq_ci += mcqe_n;
 	return mcqe_n;
 }
 
@@ -293,9 +290,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
 	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
 	if (rxq->mark) {
-		const __m128i pinfo_ft_mask =
-			_mm_set_epi32(0xffffff00, 0xffffff00,
-				      0xffffff00, 0xffffff00);
+		const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00);
 		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
 		__m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
 		__m128i flow_tag, invalid_mask;
@@ -373,12 +368,16 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 }
 
 /**
- * Receive burst of packets. An errored completion also consumes a mbuf, but the
- * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
- * before returning to application.
+ * Process a non-compressed completion and fill in mbufs in RX SW ring
+ * with data extracted from the title completion descriptor.
  *
  * @param rxq
  *   Pointer to RX queue structure.
+ * @param cq
+ *   Pointer to completion array having a non-compressed completion at first.
+ * @param elts
+ *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
+ *   the title completion descriptor to be copied to the rest of mbufs.
  * @param[out] pkts
  *   Array to store received packets.
  * @param pkts_n
@@ -386,37 +385,28 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
  * @param[out] err
  *   Pointer to a flag. Set non-zero value if pkts array has at least one error
  *   packet to handle.
- * @param[out] no_cq
- *   Pointer to a boolean. Set true if no new CQE seen.
+ * @param[out] comp
+ *   Pointer to a index. Set it to the first compressed completion if any.
  *
  * @return
- *   Number of packets received including errors (<= pkts_n).
+ *   Number of CQEs successfully processed.
  */
 static inline uint16_t
-rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint64_t *err, bool *no_cq)
+rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
+		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
-	volatile struct mlx5_cqe *cq;
-	struct rte_mbuf **elts;
 	unsigned int pos;
-	uint64_t n;
-	uint16_t repl_n;
+	uint64_t n = 0;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
-	uint16_t rcvd_pkt = 0;
-	unsigned int cq_idx = rxq->cq_ci & q_mask;
-	unsigned int elts_idx;
 	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
-	const __m128i owner_check =
-		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
-	const __m128i opcode_check =
-		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
-	const __m128i format_check =
-		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
-	const __m128i resp_err_check =
-		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
+	const __m128i owner_check =	_mm_set1_epi64x(0x0100000001000000LL);
+	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
+	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
+	const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	uint32_t rcvd_byte = 0;
 	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
@@ -448,40 +438,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 			      0,
 			      rxq->crc_present * RTE_ETHER_CRC_LEN);
 	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
-
-	MLX5_ASSERT(rxq->sges_n == 0);
-	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
-	cq = &(*rxq->cqes)[cq_idx];
-	rte_prefetch0(cq);
-	rte_prefetch0(cq + 1);
-	rte_prefetch0(cq + 2);
-	rte_prefetch0(cq + 3);
-	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
-	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	if (repl_n >= rxq->rq_repl_thresh)
-		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
-	/* See if there're unreturned mbufs from compressed CQE. */
-	rcvd_pkt = rxq->decompressed;
-	if (rcvd_pkt > 0) {
-		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
-		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
-		rxq->rq_pi += rcvd_pkt;
-		rxq->decompressed -= rcvd_pkt;
-		pkts += rcvd_pkt;
-	}
-	elts_idx = rxq->rq_pi & q_mask;
-	elts = &(*rxq->elts)[elts_idx];
-	/* Not to overflow pkts array. */
-	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
-	/* Not to cross queue end. */
-	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
-	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
-	if (!pkts_n) {
-		*no_cq = !rcvd_pkt;
-		return rcvd_pkt;
-	}
-	/* At this point, there shouldn't be any remained packets. */
-	MLX5_ASSERT(rxq->decompressed == 0);
 	/*
 	 * A. load first Qword (8bytes) in one loop.
 	 * B. copy 4 mbuf pointers from elts ring to returing pkts.
@@ -718,40 +674,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 		if (n != MLX5_VPMD_DESCS_PER_LOOP)
 			break;
 	}
-	/* If no new CQE seen, return without updating cq_db. */
-	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
-		*no_cq = true;
-		return rcvd_pkt;
-	}
-	/* Update the consumer indexes for non-compressed CQEs. */
-	MLX5_ASSERT(nocmp_n <= pkts_n);
-	rxq->cq_ci += nocmp_n;
-	rxq->rq_pi += nocmp_n;
-	rcvd_pkt += nocmp_n;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	rxq->stats.ipackets += nocmp_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	/* Decompress the last CQE if compressed. */
-	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
-		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
-		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
-							&elts[nocmp_n]);
-		/* Return more packets if needed. */
-		if (nocmp_n < pkts_n) {
-			uint16_t n = rxq->decompressed;
-
-			n = RTE_MIN(n, pkts_n - nocmp_n);
-			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
-			rxq->rq_pi += n;
-			rcvd_pkt += n;
-			rxq->decompressed -= n;
-		}
-	}
-	rte_compiler_barrier();
-	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
-	*no_cq = !rcvd_pkt;
-	return rcvd_pkt;
+	if (comp_idx == n)
+		*comp = comp_idx;
+	return nocmp_n;
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] net/mlx5: implement vectorized MPRQ burst
  2020-10-21 20:30 ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Alexander Kozyrev
  2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: refactor vectorized Rx routine Alexander Kozyrev
@ 2020-10-21 20:30   ` Alexander Kozyrev
  2020-10-22 15:01   ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Raslan Darawsheh
  2 siblings, 0 replies; 5+ messages in thread
From: Alexander Kozyrev @ 2020-10-21 20:30 UTC (permalink / raw)
  To: dev; +Cc: rasland, matan, viacheslavo

MPRQ (Multi-Packet Rx Queue) processes one packet at a time using
simple scalar instructions. MPRQ works by posting a single large buffer
(consisted of multiple fixed-size strides) in order to receive multiple
packets at once on this buffer. A Rx packet is then copied to a
user-provided mbuf or PMD attaches the Rx packet to the mbuf by the
pointer to an external buffer.

There is an opportunity to speed up the packet receiving by processing
4 packets simultaneously using SIMD (single instruction, multiple data)
extensions. Allocate mbufs in batches for every MPRQ buffer and process
the packets in groups of 4 until all the strides are exhausted. Then
switch to another MPRQ buffer and repeat the process over again.

The vectorized MPRQ burst routine is engaged automatically in case
the mprq_en=1 devarg is specified and the vectorization is not disabled
explicitly by providing rx_vec_en=0 devarg. There is a limitation:
LRO is not supported and scalar MPRQ is selected if it is on.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Slava Ovsiienko <viacheslavo@nvidia.com>
---
 drivers/net/mlx5/mlx5_devx.c     |  15 +-
 drivers/net/mlx5/mlx5_ethdev.c   |  20 +-
 drivers/net/mlx5/mlx5_rxq.c      |  96 ++++++----
 drivers/net/mlx5/mlx5_rxtx.c     | 237 ++++++-----------------
 drivers/net/mlx5/mlx5_rxtx.h     | 200 +++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec.c | 312 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec.h |  56 ------
 7 files changed, 644 insertions(+), 292 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 11bda32557..0c99fe7519 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -437,10 +437,17 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)
 	if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
 	    !rxq_data->lro) {
 		cq_attr.cqe_comp_en = 1u;
-		cq_attr.mini_cqe_res_format =
-				mlx5_rxq_mprq_enabled(rxq_data) ?
-					MLX5_CQE_RESP_FORMAT_CSUM_STRIDX :
-					MLX5_CQE_RESP_FORMAT_HASH;
+		/*
+		 * Select CSUM miniCQE format only for non-vectorized MPRQ
+		 * Rx burst, use HASH miniCQE format for everything else.
+		 */
+		if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
+			mlx5_rxq_mprq_enabled(rxq_data))
+			cq_attr.mini_cqe_res_format =
+				MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+		else
+			cq_attr.mini_cqe_res_format =
+				MLX5_CQE_RESP_FORMAT_HASH;
 		/*
 		 * For vectorized Rx, it must not be doubled in order to
 		 * make cq_ci and rq_ci aligned.
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 7631f644b2..c70cd301b5 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 
 	if (dev->rx_pkt_burst == mlx5_rx_burst ||
 	    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
-	    dev->rx_pkt_burst == mlx5_rx_burst_vec)
+	    dev->rx_pkt_burst == mlx5_rx_burst_vec ||
+	    dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)
 		return ptypes;
 	return NULL;
 }
@@ -480,11 +481,22 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
 
 	MLX5_ASSERT(dev != NULL);
 	if (mlx5_check_vec_rx_support(dev) > 0) {
-		rx_pkt_burst = mlx5_rx_burst_vec;
-		DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
-			dev->data->port_id);
+		if (mlx5_mprq_enabled(dev)) {
+			rx_pkt_burst = mlx5_rx_burst_mprq_vec;
+			DRV_LOG(DEBUG, "port %u selected vectorized"
+				" MPRQ Rx function", dev->data->port_id);
+		} else {
+			rx_pkt_burst = mlx5_rx_burst_vec;
+			DRV_LOG(DEBUG, "port %u selected vectorized"
+				" SPRQ Rx function", dev->data->port_id);
+		}
 	} else if (mlx5_mprq_enabled(dev)) {
 		rx_pkt_burst = mlx5_rx_burst_mprq;
+		DRV_LOG(DEBUG, "port %u selected MPRQ Rx function",
+			dev->data->port_id);
+	} else {
+		DRV_LOG(DEBUG, "port %u selected SPRQ Rx function",
+			dev->data->port_id);
 	}
 	return rx_pkt_burst;
 }
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index e1783ba397..ca1625eac6 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -173,7 +173,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 			rxq->mprq_repl = buf;
 	}
 	DRV_LOG(DEBUG,
-		"port %u Rx queue %u allocated and configured %u segments",
+		"port %u MPRQ queue %u allocated and configured %u segments",
 		rxq->port_id, rxq->idx, wqe_n);
 	return 0;
 error:
@@ -185,7 +185,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 					(*rxq->mprq_bufs)[i]);
 		(*rxq->mprq_bufs)[i] = NULL;
 	}
-	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+	DRV_LOG(DEBUG, "port %u MPRQ queue %u failed, freed everything",
 		rxq->port_id, rxq->idx);
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -204,7 +204,9 @@ static int
 rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
-	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
+	unsigned int elts_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+		(1 << rxq_ctrl->rxq.elts_n) * (1 << rxq_ctrl->rxq.strd_num_n) :
+		(1 << rxq_ctrl->rxq.elts_n);
 	unsigned int i;
 	int err;
 
@@ -262,7 +264,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
 	}
 	DRV_LOG(DEBUG,
-		"port %u Rx queue %u allocated and configured %u segments"
+		"port %u SPRQ queue %u allocated and configured %u segments"
 		" (max %u packets)",
 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n,
 		elts_n / (1 << rxq_ctrl->rxq.sges_n));
@@ -275,7 +277,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
 		(*rxq_ctrl->rxq.elts)[i] = NULL;
 	}
-	DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+	DRV_LOG(DEBUG, "port %u SPRQ queue %u failed, freed everything",
 		PORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx);
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -293,8 +295,15 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 int
 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
-	return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
-	       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
+	int ret = 0;
+
+	/**
+	 * For MPRQ we need to allocate both MPRQ buffers
+	 * for WQEs and simple mbufs for vector processing.
+	 */
+	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+		ret = rxq_alloc_elts_mprq(rxq_ctrl);
+	return (ret || rxq_alloc_elts_sprq(rxq_ctrl));
 }
 
 /**
@@ -309,11 +318,10 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
 	uint16_t i;
 
-	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing WRs",
-		rxq->port_id, rxq->idx);
+	DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u freeing %d WRs",
+		rxq->port_id, rxq->idx, (1u << rxq->elts_n));
 	if (rxq->mprq_bufs == NULL)
 		return;
-	MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);
 	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
 		if ((*rxq->mprq_bufs)[i] != NULL)
 			mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
@@ -335,25 +343,27 @@ static void
 rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
-	const uint16_t q_n = (1 << rxq->elts_n);
+	const uint16_t q_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
+		(1 << rxq->elts_n) * (1 << rxq->strd_num_n) :
+		(1 << rxq->elts_n);
 	const uint16_t q_mask = q_n - 1;
 	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
 	uint16_t i;
 
-	DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
-		PORT_ID(rxq_ctrl->priv), rxq->idx);
+	DRV_LOG(DEBUG, "port %u Rx queue %u freeing %d WRs",
+		PORT_ID(rxq_ctrl->priv), rxq->idx, q_n);
 	if (rxq->elts == NULL)
 		return;
 	/**
-	 * Some mbuf in the Ring belongs to the application.  They cannot be
-	 * freed.
+	 * Some mbuf in the Ring belongs to the application.
+	 * They cannot be freed.
 	 */
 	if (mlx5_rxq_check_vec_support(rxq) > 0) {
 		for (i = 0; i < used; ++i)
 			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
 		rxq->rq_pi = rxq->rq_ci;
 	}
-	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+	for (i = 0; i != q_n; ++i) {
 		if ((*rxq->elts)[i] != NULL)
 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
 		(*rxq->elts)[i] = NULL;
@@ -369,10 +379,13 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 static void
 rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
+	/*
+	 * For MPRQ we need to allocate both MPRQ buffers
+	 * for WQEs and simple mbufs for vector processing.
+	 */
 	if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
 		rxq_free_elts_mprq(rxq_ctrl);
-	else
-		rxq_free_elts_sprq(rxq_ctrl);
+	rxq_free_elts_sprq(rxq_ctrl);
 }
 
 /**
@@ -1334,20 +1347,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_rxq_ctrl *tmpl;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-	unsigned int mprq_stride_nums;
-	unsigned int mprq_stride_size;
-	unsigned int mprq_stride_cap;
 	struct mlx5_dev_config *config = &priv->config;
-	/*
-	 * Always allocate extra slots, even if eventually
-	 * the vector Rx will not be used.
-	 */
-	uint16_t desc_n =
-		desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
 	uint64_t offloads = conf->offloads |
 			   dev->data->dev_conf.rxmode.offloads;
 	unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
-	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
 	unsigned int max_rx_pkt_len = lro_on_queue ?
 			dev->data->dev_conf.rxmode.max_lro_pkt_size :
 			dev->data->dev_conf.rxmode.max_rx_pkt_len;
@@ -1355,6 +1358,21 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 							RTE_PKTMBUF_HEADROOM;
 	unsigned int max_lro_size = 0;
 	unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+	const int mprq_en = mlx5_check_mprq_support(dev) > 0;
+	unsigned int mprq_stride_nums = config->mprq.stride_num_n ?
+		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
+	unsigned int mprq_stride_size = non_scatter_min_mbuf_size <=
+		(1U << config->mprq.max_stride_size_n) ?
+		log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
+	unsigned int mprq_stride_cap = (config->mprq.stride_num_n ?
+		(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
+		(config->mprq.stride_size_n ?
+		(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
+	/*
+	 * Always allocate extra slots, even if eventually
+	 * the vector Rx will not be used.
+	 */
+	uint16_t desc_n = desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
 
 	if (non_scatter_min_mbuf_size > mb_len && !(offloads &
 						    DEV_RX_OFFLOAD_SCATTER)) {
@@ -1366,8 +1384,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		rte_errno = ENOSPC;
 		return NULL;
 	}
-	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
-			   desc_n * sizeof(struct rte_mbuf *), 0, socket);
+	tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+		sizeof(*tmpl) + desc_n * sizeof(struct rte_mbuf *) +
+		(desc >> mprq_stride_nums) * sizeof(struct mlx5_mprq_buf *),
+		0, socket);
+
 	if (!tmpl) {
 		rte_errno = ENOMEM;
 		return NULL;
@@ -1381,15 +1402,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
-	mprq_stride_nums = config->mprq.stride_num_n ?
-		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
-	mprq_stride_size = non_scatter_min_mbuf_size <=
-		(1U << config->mprq.max_stride_size_n) ?
-		log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
-	mprq_stride_cap = (config->mprq.stride_num_n ?
-		(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
-			(config->mprq.stride_size_n ?
-		(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
 	/*
 	 * This Rx queue can be configured as a Multi-Packet RQ if all of the
 	 * following conditions are met:
@@ -1535,9 +1547,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->rxq.mp = mp;
 	tmpl->rxq.elts_n = log2above(desc);
 	tmpl->rxq.rq_repl_thresh =
-		MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
+		MLX5_VPMD_RXQ_RPLNSH_THRESH(desc_n);
 	tmpl->rxq.elts =
-		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+		(struct rte_mbuf *(*)[desc_n])(tmpl + 1);
+	tmpl->rxq.mprq_bufs =
+		(struct mlx5_mprq_buf *(*)[desc])(*tmpl->rxq.elts + desc_n);
 #ifndef RTE_ARCH_64
 	tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;
 #endif
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b530ff421f..dbb427b5a8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -19,12 +19,12 @@
 #include <mlx5_prm.h>
 #include <mlx5_common.h>
 
+#include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
 #include "mlx5.h"
 #include "mlx5_mr.h"
 #include "mlx5_utils.h"
 #include "mlx5_rxtx.h"
-#include "mlx5_autoconf.h"
 
 /* TX burst subroutines return codes. */
 enum mlx5_txcmp_code {
@@ -93,10 +93,6 @@ static __rte_always_inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
 	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
 
-static __rte_always_inline void
-mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
-		 const unsigned int strd_n);
-
 static int
 mlx5_queue_state_modify(struct rte_eth_dev *dev,
 			struct mlx5_mp_arg_queue_state_modify *sm);
@@ -584,7 +580,14 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
 		       struct rte_eth_burst_mode *mode)
 {
 	eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq;
 
+	rxq = (*priv->rxqs)[rx_queue_id];
+	if (!rxq) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
 	if (pkt_burst == mlx5_rx_burst) {
 		snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
 	} else if (pkt_burst == mlx5_rx_burst_mprq) {
@@ -598,6 +601,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
 		snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
 #else
 		return -EINVAL;
+#endif
+	} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
+#if defined RTE_ARCH_X86_64
+		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
+#elif defined RTE_ARCH_ARM64
+		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
+#else
+		return -EINVAL;
 #endif
 	} else {
 		return -EINVAL;
@@ -866,6 +879,8 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
 	rxq->zip = (struct rxq_zip){
 		.ai = 0,
 	};
+	rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
+		(wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0;
 	/* Update doorbell counter. */
 	rxq->rq_ci = wqe_n >> rxq->sges_n;
 	rte_io_wmb();
@@ -969,7 +984,8 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 {
 	const uint16_t cqe_n = 1 << rxq->cqe_n;
 	const uint16_t cqe_mask = cqe_n - 1;
-	const unsigned int wqe_n = 1 << rxq->elts_n;
+	const uint16_t wqe_n = 1 << rxq->elts_n;
+	const uint16_t strd_n = 1 << rxq->strd_num_n;
 	struct mlx5_rxq_ctrl *rxq_ctrl =
 			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 	union {
@@ -1033,21 +1049,27 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 						    &sm))
 				return -1;
 			if (vec) {
-				const uint16_t q_mask = wqe_n - 1;
-				uint16_t elt_idx;
+				const uint32_t elts_n =
+					mlx5_rxq_mprq_enabled(rxq) ?
+					wqe_n * strd_n : wqe_n;
+				const uint32_t e_mask = elts_n - 1;
+				uint32_t elts_ci =
+					mlx5_rxq_mprq_enabled(rxq) ?
+					rxq->elts_ci : rxq->rq_ci;
+				uint32_t elt_idx;
 				struct rte_mbuf **elt;
 				int i;
-				unsigned int n = wqe_n - (rxq->rq_ci -
+				unsigned int n = elts_n - (elts_ci -
 							  rxq->rq_pi);
 
 				for (i = 0; i < (int)n; ++i) {
-					elt_idx = (rxq->rq_ci + i) & q_mask;
+					elt_idx = (elts_ci + i) & e_mask;
 					elt = &(*rxq->elts)[elt_idx];
 					*elt = rte_mbuf_raw_alloc(rxq->mp);
 					if (!*elt) {
 						for (i--; i >= 0; --i) {
-							elt_idx = (rxq->rq_ci +
-								   i) & q_mask;
+							elt_idx = (elts_ci +
+								   i) & elts_n;
 							elt = &(*rxq->elts)
 								[elt_idx];
 							rte_pktmbuf_free_seg
@@ -1056,7 +1078,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 						return -1;
 					}
 				}
-				for (i = 0; i < (int)wqe_n; ++i) {
+				for (i = 0; i < (int)elts_n; ++i) {
 					elt = &(*rxq->elts)[i];
 					DATA_LEN(*elt) =
 						(uint16_t)((*elt)->buf_len -
@@ -1064,7 +1086,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 				}
 				/* Padding with a fake mbuf for vec Rx. */
 				for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-					(*rxq->elts)[wqe_n + i] =
+					(*rxq->elts)[elts_n + i] =
 								&rxq->fake_mbuf;
 			}
 			mlx5_rxq_initialize(rxq);
@@ -1545,31 +1567,6 @@ mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
 	mlx5_mprq_buf_free_cb(NULL, buf);
 }
 
-static inline void
-mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
-		 const unsigned int strd_n)
-{
-	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
-	volatile struct mlx5_wqe_data_seg *wqe =
-		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
-	void *addr;
-
-	MLX5_ASSERT(rep != NULL);
-	/* Replace MPRQ buf. */
-	(*rxq->mprq_bufs)[rq_idx] = rep;
-	/* Replace WQE. */
-	addr = mlx5_mprq_buf_addr(rep, strd_n);
-	wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
-	/* If there's only one MR, no need to replace LKey in WQE. */
-	if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-		wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
-	/* Stash a mbuf for next replacement. */
-	if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
-		rxq->mprq_repl = rep;
-	else
-		rxq->mprq_repl = NULL;
-}
-
 /**
  * DPDK callback for RX with Multi-Packet RQ support.
  *
@@ -1587,12 +1584,9 @@ uint16_t
 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct mlx5_rxq_data *rxq = dpdk_rxq;
-	const unsigned int strd_n = 1 << rxq->strd_num_n;
-	const unsigned int strd_sz = 1 << rxq->strd_sz_n;
-	const unsigned int strd_shift =
-		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
-	const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
-	const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
+	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
 	unsigned int i = 0;
 	uint32_t rq_ci = rxq->rq_ci;
@@ -1601,37 +1595,18 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 	while (i < pkts_n) {
 		struct rte_mbuf *pkt;
-		void *addr;
 		int ret;
 		uint32_t len;
 		uint16_t strd_cnt;
 		uint16_t strd_idx;
-		uint32_t offset;
 		uint32_t byte_cnt;
-		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
+		enum mlx5_rqx_code rxq_code;
 
 		if (consumed_strd == strd_n) {
-			/* Replace WQE only if the buffer is still in use. */
-			if (__atomic_load_n(&buf->refcnt,
-					    __ATOMIC_RELAXED) > 1) {
-				mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
-				/* Release the old buffer. */
-				mlx5_mprq_buf_free(buf);
-			} else if (unlikely(rxq->mprq_repl == NULL)) {
-				struct mlx5_mprq_buf *rep;
-
-				/*
-				 * Currently, the MPRQ mempool is out of buffer
-				 * and doing memcpy regardless of the size of Rx
-				 * packet. Retry allocation to get back to
-				 * normal.
-				 */
-				if (!rte_mempool_get(rxq->mprq_mp,
-						     (void **)&rep))
-					rxq->mprq_repl = rep;
-			}
+			/* Replace WQE if the buffer is still in use. */
+			mprq_buf_replace(rxq, rq_ci & wq_mask);
 			/* Advance to the next WQE. */
 			consumed_strd = 0;
 			++rq_ci;
@@ -1667,122 +1642,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
 		if (rxq->crc_present)
 			len -= RTE_ETHER_CRC_LEN;
-		offset = strd_idx * strd_sz + strd_shift;
-		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
-		hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;
-		/*
-		 * Memcpy packets to the target mbuf if:
-		 * - The size of packet is smaller than mprq_max_memcpy_len.
-		 * - Out of buffer in the Mempool for Multi-Packet RQ.
-		 * - The packet's stride overlaps a headroom and scatter is off.
-		 */
-		if (len <= rxq->mprq_max_memcpy_len ||
-		    rxq->mprq_repl == NULL ||
-		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
-			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
-				DATA_LEN(pkt) = len;
-			} else if (rxq->strd_scatter_en) {
-				struct rte_mbuf *prev = pkt;
-				uint32_t seg_len =
-					RTE_MIN(rte_pktmbuf_tailroom(pkt), len);
-				uint32_t rem_len = len - seg_len;
-
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, seg_len);
-				DATA_LEN(pkt) = seg_len;
-				while (rem_len) {
-					struct rte_mbuf *next =
-						rte_pktmbuf_alloc(rxq->mp);
-
-					if (unlikely(next == NULL)) {
-						rte_pktmbuf_free(pkt);
-						++rxq->stats.rx_nombuf;
-						goto out;
-					}
-					NEXT(prev) = next;
-					SET_DATA_OFF(next, 0);
-					addr = RTE_PTR_ADD(addr, seg_len);
-					seg_len = RTE_MIN
-						(rte_pktmbuf_tailroom(next),
-						 rem_len);
-					rte_memcpy
-						(rte_pktmbuf_mtod(next, void *),
-						 addr, seg_len);
-					DATA_LEN(next) = seg_len;
-					rem_len -= seg_len;
-					prev = next;
-					++NB_SEGS(pkt);
-				}
-			} else {
-				rte_pktmbuf_free_seg(pkt);
+		rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
+					   strd_idx, strd_cnt);
+		if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
+			rte_pktmbuf_free_seg(pkt);
+			if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
 				++rxq->stats.idropped;
 				continue;
 			}
-		} else {
-			rte_iova_t buf_iova;
-			struct rte_mbuf_ext_shared_info *shinfo;
-			uint16_t buf_len = strd_cnt * strd_sz;
-			void *buf_addr;
-
-			/* Increment the refcnt of the whole chunk. */
-			__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
-			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
-				    __ATOMIC_RELAXED) <= strd_n + 1);
-			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
-			/*
-			 * MLX5 device doesn't use iova but it is necessary in a
-			 * case where the Rx packet is transmitted via a
-			 * different PMD.
-			 */
-			buf_iova = rte_mempool_virt2iova(buf) +
-				   RTE_PTR_DIFF(buf_addr, buf);
-			shinfo = &buf->shinfos[strd_idx];
-			rte_mbuf_ext_refcnt_set(shinfo, 1);
-			/*
-			 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
-			 * attaching the stride to mbuf and more offload flags
-			 * will be added below by calling rxq_cq_to_mbuf().
-			 * Other fields will be overwritten.
-			 */
-			rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
-						  buf_len, shinfo);
-			/* Set mbuf head-room. */
-			SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);
-			MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);
-			MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=
-				len - (hdrm_overlap > 0 ? hdrm_overlap : 0));
-			DATA_LEN(pkt) = len;
-			/*
-			 * Copy the last fragment of a packet (up to headroom
-			 * size bytes) in case there is a stride overlap with
-			 * a next packet's headroom. Allocate a separate mbuf
-			 * to store this fragment and link it. Scatter is on.
-			 */
-			if (hdrm_overlap > 0) {
-				MLX5_ASSERT(rxq->strd_scatter_en);
-				struct rte_mbuf *seg =
-					rte_pktmbuf_alloc(rxq->mp);
-
-				if (unlikely(seg == NULL)) {
-					rte_pktmbuf_free_seg(pkt);
-					++rxq->stats.rx_nombuf;
-					break;
-				}
-				SET_DATA_OFF(seg, 0);
-				rte_memcpy(rte_pktmbuf_mtod(seg, void *),
-					RTE_PTR_ADD(addr, len - hdrm_overlap),
-					hdrm_overlap);
-				DATA_LEN(seg) = hdrm_overlap;
-				DATA_LEN(pkt) = len - hdrm_overlap;
-				NEXT(pkt) = seg;
-				NB_SEGS(pkt) = 2;
+			if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
+				++rxq->stats.rx_nombuf;
+				break;
 			}
 		}
 		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
 		if (cqe->lro_num_seg > 1) {
-			mlx5_lro_update_hdr(addr, cqe, len);
+			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
+					    cqe, len);
 			pkt->ol_flags |= PKT_RX_LRO;
 			pkt->tso_segsz = len / cqe->lro_num_seg;
 		}
@@ -1796,7 +1672,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		*(pkts++) = pkt;
 		++i;
 	}
-out:
 	/* Update the consumer indexes. */
 	rxq->consumed_strd = consumed_strd;
 	rte_io_wmb();
@@ -1878,6 +1753,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
 	return 0;
 }
 
+__rte_weak uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,
+		       struct rte_mbuf **pkts __rte_unused,
+		       uint16_t pkts_n __rte_unused)
+{
+	return 0;
+}
+
 __rte_weak int
 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index b243b6f28c..0eafa22d63 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -30,6 +30,7 @@
 #include "mlx5_utils.h"
 #include "mlx5.h"
 #include "mlx5_autoconf.h"
+#include "mlx5_mr.h"
 
 /* Support tunnel matching. */
 #define MLX5_FLOW_TUNNEL 10
@@ -94,6 +95,12 @@ enum mlx5_rxq_err_state {
 	MLX5_RXQ_ERR_STATE_NEED_READY,
 };
 
+enum mlx5_rqx_code {
+	MLX5_RXQ_CODE_EXIT = 0,
+	MLX5_RXQ_CODE_NOMBUF,
+	MLX5_RXQ_CODE_DROPPED,
+};
+
 /* RX queue descriptor. */
 struct mlx5_rxq_data {
 	unsigned int csum:1; /* Enable checksum offloading. */
@@ -116,6 +123,7 @@ struct mlx5_rxq_data {
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t port_id;
+	uint32_t elts_ci;
 	uint32_t rq_ci;
 	uint16_t consumed_strd; /* Number of consumed strides in WQE. */
 	uint32_t rq_pi;
@@ -130,11 +138,8 @@ struct mlx5_rxq_data {
 	uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
 	volatile void *wqes;
 	volatile struct mlx5_cqe(*cqes)[];
-	RTE_STD_C11
-	union  {
-		struct rte_mbuf *(*elts)[];
-		struct mlx5_mprq_buf *(*mprq_bufs)[];
-	};
+	struct rte_mbuf *(*elts)[];
+	struct mlx5_mprq_buf *(*mprq_bufs)[];
 	struct rte_mempool *mp;
 	struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
 	struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
@@ -421,6 +426,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 			   uint16_t pkts_n);
+uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,
+				uint16_t pkts_n);
 
 /* mlx5_mr.c */
 
@@ -681,4 +688,187 @@ mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)
 	return ci;
 }
 
+/**
+ * Replace MPRQ buffer.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param rq_idx
+ *   RQ index to replace.
+ */
+static __rte_always_inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx)
+{
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+	volatile struct mlx5_wqe_data_seg *wqe =
+		&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+	void *addr;
+
+	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) > 1) {
+		MLX5_ASSERT(rep != NULL);
+		/* Replace MPRQ buf. */
+		(*rxq->mprq_bufs)[rq_idx] = rep;
+		/* Replace WQE. */
+		addr = mlx5_mprq_buf_addr(rep, strd_n);
+		wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+		/* If there's only one MR, no need to replace LKey in WQE. */
+		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+			wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+		/* Stash a mbuf for next replacement. */
+		if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+			rxq->mprq_repl = rep;
+		else
+			rxq->mprq_repl = NULL;
+		/* Release the old buffer. */
+		mlx5_mprq_buf_free(buf);
+	} else if (unlikely(rxq->mprq_repl == NULL)) {
+		struct mlx5_mprq_buf *rep;
+
+		/*
+		 * Currently, the MPRQ mempool is out of buffer
+		 * and doing memcpy regardless of the size of Rx
+		 * packet. Retry allocation to get back to
+		 * normal.
+		 */
+		if (!rte_mempool_get(rxq->mprq_mp, (void **)&rep))
+			rxq->mprq_repl = rep;
+	}
+}
+
+/**
+ * Attach or copy MPRQ buffer content to a packet.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param pkt
+ *   Pointer to a packet to fill.
+ * @param len
+ *   Packet length.
+ * @param buf
+ *   Pointer to a MPRQ buffer to take the data from.
+ * @param strd_idx
+ *   Stride index to start from.
+ * @param strd_cnt
+ *   Number of strides to consume.
+ */
+static __rte_always_inline enum mlx5_rqx_code
+mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
+		struct mlx5_mprq_buf *buf, uint16_t strd_idx, uint16_t strd_cnt)
+{
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint16_t strd_sz = 1 << rxq->strd_sz_n;
+	const uint16_t strd_shift =
+		MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+	const int32_t hdrm_overlap =
+		len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;
+	const uint32_t offset = strd_idx * strd_sz + strd_shift;
+	void *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+
+	/*
+	 * Memcpy packets to the target mbuf if:
+	 * - The size of packet is smaller than mprq_max_memcpy_len.
+	 * - Out of buffer in the Mempool for Multi-Packet RQ.
+	 * - The packet's stride overlaps a headroom and scatter is off.
+	 */
+	if (len <= rxq->mprq_max_memcpy_len ||
+	    rxq->mprq_repl == NULL ||
+	    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
+		if (likely(len <=
+			   (uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) {
+			rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+				   addr, len);
+			DATA_LEN(pkt) = len;
+		} else if (rxq->strd_scatter_en) {
+			struct rte_mbuf *prev = pkt;
+			uint32_t seg_len = RTE_MIN(len, (uint32_t)
+				(pkt->buf_len - RTE_PKTMBUF_HEADROOM));
+			uint32_t rem_len = len - seg_len;
+
+			rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+				   addr, seg_len);
+			DATA_LEN(pkt) = seg_len;
+			while (rem_len) {
+				struct rte_mbuf *next =
+					rte_pktmbuf_alloc(rxq->mp);
+
+				if (unlikely(next == NULL))
+					return MLX5_RXQ_CODE_NOMBUF;
+				NEXT(prev) = next;
+				SET_DATA_OFF(next, 0);
+				addr = RTE_PTR_ADD(addr, seg_len);
+				seg_len = RTE_MIN(rem_len, (uint32_t)
+					(next->buf_len - RTE_PKTMBUF_HEADROOM));
+				rte_memcpy
+					(rte_pktmbuf_mtod(next, void *),
+					 addr, seg_len);
+				DATA_LEN(next) = seg_len;
+				rem_len -= seg_len;
+				prev = next;
+				++NB_SEGS(pkt);
+			}
+		} else {
+			return MLX5_RXQ_CODE_DROPPED;
+		}
+	} else {
+		rte_iova_t buf_iova;
+		struct rte_mbuf_ext_shared_info *shinfo;
+		uint16_t buf_len = strd_cnt * strd_sz;
+		void *buf_addr;
+
+		/* Increment the refcnt of the whole chunk. */
+		__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
+		MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
+			    __ATOMIC_RELAXED) <= strd_n + 1);
+		buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+		/*
+		 * MLX5 device doesn't use iova but it is necessary in a
+		 * case where the Rx packet is transmitted via a
+		 * different PMD.
+		 */
+		buf_iova = rte_mempool_virt2iova(buf) +
+			   RTE_PTR_DIFF(buf_addr, buf);
+		shinfo = &buf->shinfos[strd_idx];
+		rte_mbuf_ext_refcnt_set(shinfo, 1);
+		/*
+		 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+		 * attaching the stride to mbuf and more offload flags
+		 * will be added below by calling rxq_cq_to_mbuf().
+		 * Other fields will be overwritten.
+		 */
+		rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
+					  buf_len, shinfo);
+		/* Set mbuf head-room. */
+		SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);
+		MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);
+		MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=
+			len - (hdrm_overlap > 0 ? hdrm_overlap : 0));
+		DATA_LEN(pkt) = len;
+		/*
+		 * Copy the last fragment of a packet (up to headroom
+		 * size bytes) in case there is a stride overlap with
+		 * a next packet's headroom. Allocate a separate mbuf
+		 * to store this fragment and link it. Scatter is on.
+		 */
+		if (hdrm_overlap > 0) {
+			MLX5_ASSERT(rxq->strd_scatter_en);
+			struct rte_mbuf *seg =
+				rte_pktmbuf_alloc(rxq->mp);
+
+			if (unlikely(seg == NULL))
+				return MLX5_RXQ_CODE_NOMBUF;
+			SET_DATA_OFF(seg, 0);
+			rte_memcpy(rte_pktmbuf_mtod(seg, void *),
+				RTE_PTR_ADD(addr, len - hdrm_overlap),
+				hdrm_overlap);
+			DATA_LEN(seg) = hdrm_overlap;
+			DATA_LEN(pkt) = len - hdrm_overlap;
+			NEXT(pkt) = seg;
+			NB_SEGS(pkt) = 2;
+		}
+	}
+	return MLX5_RXQ_CODE_EXIT;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index aa48775738..469ea8401d 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -77,6 +77,177 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	return n;
 }
 
+/**
+ * Replenish buffers for RX in bulk.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ */
+static inline void
+mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
+{
+	const uint16_t q_n = 1 << rxq->elts_n;
+	const uint16_t q_mask = q_n - 1;
+	uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);
+	uint16_t elts_idx = rxq->rq_ci & q_mask;
+	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+	volatile struct mlx5_wqe_data_seg *wq =
+		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
+	unsigned int i;
+
+	if (n >= rxq->rq_repl_thresh) {
+		MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
+		MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
+			    MLX5_VPMD_DESCS_PER_LOOP);
+		/* Not to cross queue end. */
+		n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
+		if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+			rxq->stats.rx_nombuf += n;
+			return;
+		}
+		for (i = 0; i < n; ++i) {
+			void *buf_addr;
+
+			/*
+			 * In order to support the mbufs with external attached
+			 * data buffer we should use the buf_addr pointer
+			 * instead of rte_mbuf_buf_addr(). It touches the mbuf
+			 * itself and may impact the performance.
+			 */
+			buf_addr = elts[i]->buf_addr;
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+						      RTE_PKTMBUF_HEADROOM);
+			/* If there's a single MR, no need to replace LKey. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)
+				     > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+		}
+		rxq->rq_ci += n;
+		/* Prevent overflowing into consumed mbufs. */
+		elts_idx = rxq->rq_ci & q_mask;
+		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+			(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
+		rte_io_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	}
+}
+
+/**
+ * Replenish buffers for MPRQ RX in bulk.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ */
+static inline void
+mlx5_rx_mprq_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
+{
+	const uint16_t wqe_n = 1 << rxq->elts_n;
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t elts_n = wqe_n * strd_n;
+	const uint32_t wqe_mask = elts_n - 1;
+	uint32_t n = elts_n - (rxq->elts_ci - rxq->rq_pi);
+	uint32_t elts_idx = rxq->elts_ci & wqe_mask;
+	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+
+	/* Not to cross queue end. */
+	if (n >= rxq->rq_repl_thresh) {
+		MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n));
+		MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n) >
+			     MLX5_VPMD_DESCS_PER_LOOP);
+		n = RTE_MIN(n, elts_n - elts_idx);
+		if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+			rxq->stats.rx_nombuf += n;
+			return;
+		}
+		rxq->elts_ci += n;
+	}
+}
+
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param pkts_n
+ *   Number of packets to be stored.
+ *
+ * @return
+ *   Number of packets successfully copied/attached (<= pkts_n).
+ */
+static inline uint16_t
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq,
+		     struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	const uint16_t wqe_n = 1 << rxq->elts_n;
+	const uint16_t wqe_mask = wqe_n - 1;
+	const uint16_t strd_sz = 1 << rxq->strd_sz_n;
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t elts_n = wqe_n * strd_n;
+	const uint32_t elts_mask = elts_n - 1;
+	uint32_t elts_idx = rxq->rq_pi & elts_mask;
+	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+	uint32_t rq_ci = rxq->rq_ci;
+	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask];
+	uint16_t copied = 0;
+	uint16_t i = 0;
+
+	for (i = 0; i < pkts_n; ++i) {
+		uint16_t strd_cnt;
+		enum mlx5_rqx_code rxq_code;
+
+		if (rxq->consumed_strd == strd_n) {
+			/* Replace WQE if the buffer is still in use. */
+			mprq_buf_replace(rxq, rq_ci & wqe_mask);
+			/* Advance to the next WQE. */
+			rxq->consumed_strd = 0;
+			rq_ci++;
+			buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask];
+		}
+
+		if (!elts[i]->pkt_len) {
+			rxq->consumed_strd = strd_n;
+			rte_pktmbuf_free_seg(elts[i]);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			rxq->stats.ipackets -= 1;
+#endif
+			continue;
+		}
+		strd_cnt = (elts[i]->pkt_len / strd_sz) +
+			   ((elts[i]->pkt_len % strd_sz) ? 1 : 0);
+		rxq_code = mprq_buf_to_pkt(rxq, elts[i], elts[i]->pkt_len,
+					   buf, rxq->consumed_strd, strd_cnt);
+		rxq->consumed_strd += strd_cnt;
+		if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
+			rte_pktmbuf_free_seg(elts[i]);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			rxq->stats.ipackets -= 1;
+			rxq->stats.ibytes -= elts[i]->pkt_len;
+#endif
+			if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
+				++rxq->stats.rx_nombuf;
+				break;
+			}
+			if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
+				++rxq->stats.idropped;
+				continue;
+			}
+		}
+		pkts[copied++] = elts[i];
+	}
+	rxq->rq_pi += i;
+	rxq->cq_ci += i;
+	rte_io_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	if (rq_ci != rxq->rq_ci) {
+		rxq->rq_ci = rq_ci;
+		rte_io_wmb();
+		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	}
+	return copied;
+}
+
 /**
  * Receive burst of packets. An errored completion also consumes a mbuf, but the
  * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
@@ -204,7 +375,142 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	bool no_cq = false;
 
 	do {
-		nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err, &no_cq);
+		nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn,
+				    &err, &no_cq);
+		if (unlikely(err | rxq->err_state))
+			nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
+		tn += nb_rx;
+		if (unlikely(no_cq))
+			break;
+	} while (tn != pkts_n);
+	return tn;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+		 uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+	const uint16_t q_n = 1 << rxq->cqe_n;
+	const uint16_t q_mask = q_n - 1;
+	const uint16_t wqe_n = 1 << rxq->elts_n;
+	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t elts_n = wqe_n * strd_n;
+	const uint32_t elts_mask = elts_n - 1;
+	volatile struct mlx5_cqe *cq;
+	struct rte_mbuf **elts;
+	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+	uint16_t nocmp_n = 0;
+	uint16_t rcvd_pkt = 0;
+	uint16_t cp_pkt = 0;
+	unsigned int cq_idx = rxq->cq_ci & q_mask;
+	unsigned int elts_idx;
+
+	MLX5_ASSERT(rxq->sges_n == 0);
+	cq = &(*rxq->cqes)[cq_idx];
+	rte_prefetch0(cq);
+	rte_prefetch0(cq + 1);
+	rte_prefetch0(cq + 2);
+	rte_prefetch0(cq + 3);
+	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+	mlx5_rx_mprq_replenish_bulk_mbuf(rxq);
+	/* See if there're unreturned mbufs from compressed CQE. */
+	rcvd_pkt = rxq->decompressed;
+	if (rcvd_pkt > 0) {
+		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+		cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt);
+		rxq->decompressed -= rcvd_pkt;
+		pkts += cp_pkt;
+	}
+	elts_idx = rxq->rq_pi & elts_mask;
+	elts = &(*rxq->elts)[elts_idx];
+	/* Not to overflow pkts array. */
+	pkts_n = RTE_ALIGN_FLOOR(pkts_n - cp_pkt, MLX5_VPMD_DESCS_PER_LOOP);
+	/* Not to cross queue end. */
+	pkts_n = RTE_MIN(pkts_n, elts_n - elts_idx);
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+	/* Not to move past the allocated mbufs. */
+	pkts_n = RTE_MIN(pkts_n, rxq->elts_ci - rxq->rq_pi);
+	if (!pkts_n) {
+		*no_cq = !cp_pkt;
+		return cp_pkt;
+	}
+	/* At this point, there shouldn't be any remaining packets. */
+	MLX5_ASSERT(rxq->decompressed == 0);
+	/* Process all the CQEs */
+	nocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, err, &comp_idx);
+	/* If no new CQE seen, return without updating cq_db. */
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
+		return cp_pkt;
+	}
+	/* Update the consumer indexes for non-compressed CQEs. */
+	MLX5_ASSERT(nocmp_n <= pkts_n);
+	cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n);
+	rcvd_pkt += cp_pkt;
+	/* Decompress the last CQE if compressed. */
+	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
+		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
+		/* Return more packets if needed. */
+		if (nocmp_n < pkts_n) {
+			uint16_t n = rxq->decompressed;
+
+			n = RTE_MIN(n, pkts_n - nocmp_n);
+			cp_pkt = rxq_copy_mprq_mbuf_v(rxq, &pkts[cp_pkt], n);
+			rcvd_pkt += cp_pkt;
+			rxq->decompressed -= n;
+		}
+	}
+	*no_cq = !rcvd_pkt;
+	return rcvd_pkt;
+}
+
+/**
+ * DPDK callback for vectorized MPRQ RX.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct mlx5_rxq_data *rxq = dpdk_rxq;
+	uint16_t nb_rx = 0;
+	uint16_t tn = 0;
+	uint64_t err = 0;
+	bool no_cq = false;
+
+	do {
+		nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,
+					 &err, &no_cq);
 		if (unlikely(err | rxq->err_state))
 			nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
 		tn += nb_rx;
@@ -229,8 +535,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)
 	struct mlx5_rxq_ctrl *ctrl =
 		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
-	if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))
-		return -ENOTSUP;
 	if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
 		return -ENOTSUP;
 	if (rxq->lro)
@@ -257,8 +561,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)
 		return -ENOTSUP;
 	if (!priv->config.rx_vec_en)
 		return -ENOTSUP;
-	if (mlx5_mprq_enabled(dev))
-		return -ENOTSUP;
 	/* All the configured queues should support. */
 	for (i = 0; i < priv->rxqs_n; ++i) {
 		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index ce27074b08..93b4f517bb 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -12,7 +12,6 @@
 #include <mlx5_prm.h>
 
 #include "mlx5_autoconf.h"
-
 #include "mlx5_mr.h"
 
 /* HW checksum offload capabilities of vectorized Tx. */
@@ -68,59 +67,4 @@ S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) ==
 S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) ==
 		  offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);
 
-/**
- * Replenish buffers for RX in bulk.
- *
- * @param rxq
- *   Pointer to RX queue structure.
- */
-static inline void
-mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)
-{
-	const uint16_t q_n = 1 << rxq->elts_n;
-	const uint16_t q_mask = q_n - 1;
-	uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	uint16_t elts_idx = rxq->rq_ci & q_mask;
-	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
-	volatile struct mlx5_wqe_data_seg *wq =
-		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
-	unsigned int i;
-
-	if (n >= rxq->rq_repl_thresh) {
-		MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
-		MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >
-			    MLX5_VPMD_DESCS_PER_LOOP);
-		/* Not to cross queue end. */
-		n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
-		if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
-			rxq->stats.rx_nombuf += n;
-			return;
-		}
-		for (i = 0; i < n; ++i) {
-			void *buf_addr;
-
-			/*
-			 * In order to support the mbufs with external attached
-			 * data buffer we should use the buf_addr pointer
-			 * instead of rte_mbuf_buf_addr(). It touches the mbuf
-			 * itself and may impact the performance.
-			 */
-			buf_addr = elts[i]->buf_addr;
-			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
-						      RTE_PKTMBUF_HEADROOM);
-			/* If there's a single MR, no need to replace LKey. */
-			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)
-				     > 1))
-				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
-		}
-		rxq->rq_ci += n;
-		/* Prevent overflowing into consumed mbufs. */
-		elts_idx = rxq->rq_ci & q_mask;
-		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-			(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;
-		rte_io_wmb();
-		*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
-	}
-}
-
 #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq
  2020-10-21 20:30 ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Alexander Kozyrev
  2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: refactor vectorized Rx routine Alexander Kozyrev
  2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: implement vectorized MPRQ burst Alexander Kozyrev
@ 2020-10-22 15:01   ` Raslan Darawsheh
  2 siblings, 0 replies; 5+ messages in thread
From: Raslan Darawsheh @ 2020-10-22 15:01 UTC (permalink / raw)
  To: Alexander Kozyrev, dev; +Cc: Matan Azrad, Slava Ovsiienko

Hi,

> -----Original Message-----
> From: Alexander Kozyrev <akozyrev@nvidia.com>
> Sent: Wednesday, October 21, 2020 11:30 PM
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; Matan Azrad
> <matan@nvidia.com>; Slava Ovsiienko <viacheslavo@nvidia.com>
> Subject: [PATCH v2 0/2] net/mlx5: add vectorized mprq
> 
> The vectorized Rx burst function helps to accelerate the Rx processing
> by using SIMD (single instruction, multiple data) extensions for the
> multi-buffer packet processing. Pre-allocating multiple mbufs and
> filling them in batches of four greatly improves the throughput of the
> Rx burst routine.
> 
> MPRQ (Multi-Packet Rx Queue) lacks the vectorized version currently.
> It works by posting a single large buffer (consisted of  multiple
> fixed-size strides) in order to receive multiple packets at once on this
> buffer. A Rx packet is then copied to a user-provided mbuf or PMD
> attaches the Rx packet to the mbuf by the pointer to an external buffer.
> 
> It is proposed to add a vectorized MPRQ Rx routine to speed up the MPRQ
> buffer handling as well. It would require pre-allocation of multiple
> mbufs every time we exhaust all the strides from the current MPRQ buffer
> and switch to a new one. The new mlx5_rx_burst_mprq_vec() routine will
> take care of this as well as of decision on whether should we copy or
> attach an external buffer for a packet. The batch processing logic won't
> be different from the simple vectorized Rx routine.
> 
> The new vectorized MPRQ burst function is going to be selected
> automatically whenever the mprq_en devarg is specified. If SIMD is not
> available on the platform we fall back to the simple MPRQ Rx burst
> function. LRO is not supported by the vectorized MPRQ version and fall
> back to the regular MPRQ will be performed.
> 
> 
> Alexander Kozyrev (2):
>   net/mlx5: refactor vectorized Rx routine
>   net/mlx5: implement vectorized MPRQ burst
> 
>  drivers/net/mlx5/mlx5_devx.c             |  15 +-
>  drivers/net/mlx5/mlx5_ethdev.c           |  20 +-
>  drivers/net/mlx5/mlx5_rxq.c              |  96 +++---
>  drivers/net/mlx5/mlx5_rxtx.c             | 237 ++++---------
>  drivers/net/mlx5/mlx5_rxtx.h             | 200 ++++++++++-
>  drivers/net/mlx5/mlx5_rxtx_vec.c         | 416 ++++++++++++++++++++++-
>  drivers/net/mlx5/mlx5_rxtx_vec.h         |  55 ---
>  drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 106 ++----
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 103 ++----
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 121 ++-----
>  10 files changed, 813 insertions(+), 556 deletions(-)
> 
> --
> 2.24.1


Series applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-10-22 15:01 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-19  4:11 [dpdk-dev] [PATCH] net/mlx5: implement vectorized MPRQ burst Alexander Kozyrev
2020-10-21 20:30 ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Alexander Kozyrev
2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: refactor vectorized Rx routine Alexander Kozyrev
2020-10-21 20:30   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: implement vectorized MPRQ burst Alexander Kozyrev
2020-10-22 15:01   ` [dpdk-dev] [PATCH v2 0/2] net/mlx5: add vectorized mprq Raslan Darawsheh

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git