DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] net/mlx5: support Flow Tag and Packet Header miniCQEs
@ 2020-10-28  2:37 Alexander Kozyrev
  2020-11-01 16:27 ` [dpdk-dev] [PATCH v2] " Alexander Kozyrev
  0 siblings, 1 reply; 5+ messages in thread
From: Alexander Kozyrev @ 2020-10-28  2:37 UTC (permalink / raw)
  To: dev; +Cc: stable, rasland, viacheslavo, matan

CQE compression allows us to save the PCI bandwidth and improve
the performance by compressing several CQEs together into a miniCQE.
But the miniCQE size is only 8 bytes and this limits the ability
to successfully keep the compression session in case of various
traffic patterns.

The current miniCQE format only keeps the compression session alive
in case of uniform traffic with the Hash RSS as the only difference.
There are requests to keep the compression session in case of tagged
traffic by RTE Flow Mark Id and mixed UDP/TCP and IPv4/IPv6 traffic.
Add 2 new miniCQE formats in order to achieve the best performance
for these traffic patterns: Flow Tag and Packet Header miniCQEs.

The existing rxq_cqe_comp_en devarg is modified to specify the
desired miniCQE format. Specifying 2 selects Flow Tag format
for better compression rate in case of RTE Flow Mark traffic.
Specifying 3 selects Checksum format (existing format for MPRQ).
Specifying 4 selects L3/L4 Header format for better compression
rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
 doc/guides/nics/mlx5.rst               |   8 +
 doc/guides/rel_notes/release_20_11.rst |   2 +
 drivers/common/mlx5/mlx5_devx_cmds.c   |   7 +-
 drivers/common/mlx5/mlx5_devx_cmds.h   |   1 +
 drivers/common/mlx5/mlx5_prm.h         |  27 +++-
 drivers/net/mlx5/mlx5.c                |   7 +
 drivers/net/mlx5/mlx5.h                |   1 +
 drivers/net/mlx5/mlx5_devx.c           |  42 +++--
 drivers/net/mlx5/mlx5_rxtx.c           | 134 ++++++++++------
 drivers/net/mlx5/mlx5_rxtx.h           |   2 +
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h   | 202 +++++++++++++++++--------
 11 files changed, 303 insertions(+), 130 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index e5e55fc409..72b026a5aa 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -431,6 +431,14 @@ Driver options
 
   A nonzero value enables the compression of CQE on RX side. This feature
   allows to save PCI bandwidth and improve performance. Enabled by default.
+  Different compression formats are supported in order to achieve the best
+  performance for different traffic patterns. Hash RSS format is the default.
+
+  Specifying 2 as a ``rxq_cqe_comp_en`` value selects Flow Tag format for
+  better compression rate in case of RTE Flow Mark traffic.
+  Specifying 3 as a ``rxq_cqe_comp_en`` value selects Checksum format.
+  Specifying 4 as a ``rxq_cqe_comp_en`` value selects L3/L4 Header format for
+  better compression rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.
 
   Supported on:
 
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index e2847712e8..9bd0f96c12 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -358,6 +358,8 @@ New Features
   * Added support for QinQ packets matching.
   * Added support for the new vlan fields ``has_vlan`` in the eth item and
     ``has_more_vlan`` in the vlan item.
+  * Added vectorized Multi-Packet Rx Queue burst.
+  * Added support for 2 new miniCQE formats: Flow Tag and L3/L4 header.
 
 * **Updated vhost sample application.**
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..586bdda6aa 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1564,8 +1564,11 @@ mlx5_devx_cmd_create_cq(void *ctx, struct mlx5_devx_cq_attr *attr)
 		 MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET(cqc, cqctx, c_eqn, attr->eqn);
 	MLX5_SET(cqc, cqctx, uar_page, attr->uar_page_id);
-	MLX5_SET(cqc, cqctx, cqe_comp_en, attr->cqe_comp_en);
-	MLX5_SET(cqc, cqctx, mini_cqe_res_format, attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, cqe_comp_en, !!attr->cqe_comp_en);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format,
+		 attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format_ext,
+		 attr->mini_cqe_res_format_ext);
 	MLX5_SET(cqc, cqctx, cqe_sz, attr->cqe_size);
 	if (attr->q_umem_valid) {
 		MLX5_SET(create_cq_in, in, cq_umem_valid, attr->q_umem_valid);
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..ab33ce3046 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -255,6 +255,7 @@ struct mlx5_devx_cq_attr {
 	uint32_t overrun_ignore:1;
 	uint32_t cqe_comp_en:1;
 	uint32_t mini_cqe_res_format:2;
+	uint32_t mini_cqe_res_format_ext:2;
 	uint32_t cqe_size:3;
 	uint32_t log_cq_size:5;
 	uint32_t log_page_size:5;
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..b893d8a348 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -239,6 +239,9 @@
 /* Default mark mask for metadata legacy mode. */
 #define MLX5_FLOW_MARK_MASK 0xffffff
 
+/* Byte length mask when mark is enable in miniCQE */
+#define MLX5_LEN_WITH_MARK_MASK 0xffffff00
+
 /* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
@@ -2152,11 +2155,14 @@ struct mlx5_ifc_cqc_bits {
 	u8 cqe_comp_en[0x1];
 	u8 mini_cqe_res_format[0x2];
 	u8 st[0x4];
-	u8 reserved_at_18[0x8];
+	u8 reserved_at_18[0x1];
+	u8 cqe_comp_layout[0x7];
 	u8 dbr_umem_id[0x20];
 	u8 reserved_at_40[0x14];
 	u8 page_offset[0x6];
-	u8 reserved_at_5a[0x6];
+	u8 reserved_at_5a[0x2];
+	u8 mini_cqe_res_format_ext[0x2];
+	u8 cq_timestamp_format[0x2];
 	u8 reserved_at_60[0x3];
 	u8 log_cq_size[0x5];
 	u8 uar_page[0x18];
@@ -2913,7 +2919,14 @@ struct mlx5_mini_cqe8 {
 	union {
 		uint32_t rx_hash_result;
 		struct {
-			uint16_t checksum;
+			union {
+				uint16_t checksum;
+				uint16_t flow_tag_high;
+				union {
+					uint8_t reserved;
+					uint8_t hdr_type;
+				};
+			};
 			uint16_t stride_idx;
 		};
 		struct {
@@ -2922,15 +2935,19 @@ struct mlx5_mini_cqe8 {
 			uint8_t  reserved;
 		} s_wqe_info;
 	};
-	uint32_t byte_cnt;
+	union {
+		uint32_t byte_cnt_flow;
+		uint32_t byte_cnt;
+	};
 };
 
 /* Mini CQE responder format. */
 enum {
 	MLX5_CQE_RESP_FORMAT_HASH = 0x0,
 	MLX5_CQE_RESP_FORMAT_CSUM = 0x1,
-	MLX5_CQE_RESP_FORMAT_CSUM_FLOW_TAG = 0x2,
+	MLX5_CQE_RESP_FORMAT_FTAG_STRIDX = 0x2,
 	MLX5_CQE_RESP_FORMAT_CSUM_STRIDX = 0x3,
+	MLX5_CQE_RESP_FORMAT_L34H_STRIDX = 0x4,
 };
 
 /* srTCM PRM flow meter parameters. */
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 27c9c2abb6..9fd8f0ebbf 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1545,7 +1545,14 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	}
 	mod = tmp >= 0 ? tmp : -tmp;
 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
+		if (tmp > MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+			DRV_LOG(ERR, "invalid CQE compression "
+				     "format parameter");
+			rte_errno = EINVAL;
+			return -rte_errno;
+		}
 		config->cqe_comp = !!tmp;
+		config->cqe_comp_fmt = tmp;
 	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
 		config->cqe_pad = !!tmp;
 	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8de5842cc7..941a049179 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -192,6 +192,7 @@ struct mlx5_dev_config {
 	/* Whether tunnel stateless offloads are supported. */
 	unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
+	unsigned int cqe_comp_fmt:3; /* CQE compression format. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 5fce4cd555..1b179abe95 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -437,17 +437,37 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)
 	if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
 	    !rxq_data->lro) {
 		cq_attr.cqe_comp_en = 1u;
-		/*
-		 * Select CSUM miniCQE format only for non-vectorized MPRQ
-		 * Rx burst, use HASH miniCQE format for everything else.
-		 */
-		if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
-			mlx5_rxq_mprq_enabled(rxq_data))
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
-		else
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_HASH;
+		rxq_data->mcqe_format = priv->config.cqe_comp_fmt;
+		rxq_data->byte_mask = UINT32_MAX;
+		switch (priv->config.cqe_comp_fmt) {
+		case MLX5_CQE_RESP_FORMAT_HASH:
+		case MLX5_CQE_RESP_FORMAT_CSUM:
+			/*
+			 * Select CSUM miniCQE format only for non-vectorized
+			 * MPRQ Rx burst, use HASH miniCQE format for others.
+			 */
+			if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
+			    mlx5_rxq_mprq_enabled(rxq_data))
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+			else
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_HASH;
+			rxq_data->mcqe_format = cq_attr.mini_cqe_res_format;
+			break;
+		case MLX5_CQE_RESP_FORMAT_FTAG_STRIDX:
+			rxq_data->byte_mask = MLX5_LEN_WITH_MARK_MASK;
+		case MLX5_CQE_RESP_FORMAT_CSUM_STRIDX:
+			cq_attr.mini_cqe_res_format = priv->config.cqe_comp_fmt;
+			break;
+		case MLX5_CQE_RESP_FORMAT_L34H_STRIDX:
+			cq_attr.mini_cqe_res_format = 0;
+			cq_attr.mini_cqe_res_format_ext = 1;
+			break;
+		}
+		DRV_LOG(DEBUG,
+			"Port %u Rx CQE compression is enabled, format %d.",
+			dev->data->port_id, priv->config.cqe_comp_fmt);
 		/*
 		 * For vectorized Rx, it must not be doubled in order to
 		 * make cq_ci and rq_ci aligned.
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2ffacf8882..1ecae79372 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -80,7 +80,8 @@ static uint16_t mlx5_tx_burst_##func(void *txq, \
 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
 
 static __rte_always_inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe);
 
 static __rte_always_inline int
 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
@@ -91,7 +92,8 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
 
 static __rte_always_inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe);
 
 static int
 mlx5_queue_state_modify(struct rte_eth_dev *dev,
@@ -100,12 +102,13 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev,
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum);
+			uint32_t phcsum, uint8_t l4_type);
 
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len);
+			volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len);
 
 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
@@ -813,12 +816,19 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
  *   Packet type for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe)
 {
 	uint8_t idx;
-	uint8_t pinfo = cqe->pkt_info;
-	uint16_t ptype = cqe->hdr_type_etc;
+	uint8_t ptype;
+	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
 
+	/* Get l3/l4 header from mini-CQE in case L3/L4 format*/
+	if (unlikely(mcqe == NULL ||
+		rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX))
+		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
+	else
+		ptype = mcqe->hdr_type >> 2;
 	/*
 	 * The index to the array should have:
 	 * bit[1:0] = l3_hdr_type
@@ -827,7 +837,7 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
 	 * bit[6] = tunneled
 	 * bit[7] = outer_l3_type
 	 */
-	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+	idx = pinfo | ptype;
 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
 }
 
@@ -1131,8 +1141,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				(volatile struct mlx5_mini_cqe8 (*)[8])
 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
 							  cqe_cnt].pkt_info);
-
-			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
+					       rxq->byte_mask);
 			*mcqe = &(*mc)[zip->ai & 7];
 			if ((++zip->ai & 7) == 0) {
 				/* Invalidate consumed CQEs */
@@ -1210,7 +1220,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				--rxq->cq_ci;
 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
 				/* Get packet size to return. */
-				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+				len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
+						       rxq->byte_mask);
 				*mcqe = &(*mc)[0];
 				zip->ai = 1;
 				/* Prefetch all to be invalidated */
@@ -1274,20 +1285,35 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
  */
 static inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe)
 {
-	/* Update packet information. */
-	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
+	/* If compressed, take hash result from mini-CQE. */
+	uint32_t rss_hash_res = 0;
+	uint32_t mark = 0;
+
+		/* Update packet information. */
+	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
+	if (mcqe == NULL || rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
+		rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+	else
+		rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
+
 	if (rss_hash_res && rxq->rss_hash) {
 		pkt->hash.rss = rss_hash_res;
 		pkt->ol_flags |= PKT_RX_RSS_HASH;
 	}
-	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
+	/* If compressed, take flow tag from mini-CQE. */
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+		mark = cqe->sop_drop_qpn;
+	else
+		mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
+			(mcqe->flow_tag_high << 16);
+	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(mark)) {
 		pkt->ol_flags |= PKT_RX_FDIR;
-		if (cqe->sop_drop_qpn !=
+		if (mark !=
 		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
-			uint32_t mark = cqe->sop_drop_qpn;
-
 			pkt->ol_flags |= PKT_RX_FDIR_ID;
 			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
 		}
@@ -1299,10 +1325,20 @@ rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
 	}
 	if (rxq->csum)
 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
-	if (rxq->vlan_strip &&
-	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
-		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
-		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+	if (rxq->vlan_strip) {
+		bool vlan_strip;
+
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+			vlan_strip = cqe->hdr_type_etc &
+				     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED);
+		else
+			vlan_strip = mcqe->hdr_type &
+				     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED);
+		if (vlan_strip) {
+			pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+			pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+		}
 	}
 	if (rxq->hw_timestamp) {
 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
@@ -1348,7 +1384,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res;
 
 		if (pkt)
 			NEXT(seg) = rep;
@@ -1387,18 +1422,14 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			pkt = seg;
 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
 			pkt->ol_flags &= EXT_ATTACHED_MBUF;
-			/* If compressed, take hash result from mini-CQE. */
-			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
-							cqe->rx_hash_res :
-							mcqe->rx_hash_result);
-			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 			if (rxq->crc_present)
 				len -= RTE_ETHER_CRC_LEN;
 			PKT_LEN(pkt) = len;
 			if (cqe->lro_num_seg > 1) {
 				mlx5_lro_update_hdr
 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
-					 len);
+					 mcqe, rxq, len);
 				pkt->ol_flags |= PKT_RX_LRO;
 				pkt->tso_segsz = len / cqe->lro_num_seg;
 			}
@@ -1468,10 +1499,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum)
+			uint32_t phcsum, uint8_t l4_type)
 {
-	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
-			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
 	/*
 	 * The HW calculates only the TCP payload checksum, need to complete
 	 * the TCP header checksum and the L3 pseudo-header checksum.
@@ -1510,7 +1539,8 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len)
+			volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len)
 {
 	union {
 		struct rte_ether_hdr *eth;
@@ -1524,6 +1554,7 @@ mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 	};
 	uint16_t proto = h.eth->ether_type;
 	uint32_t phcsum;
+	uint8_t l4_type;
 
 	h.eth++;
 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
@@ -1545,7 +1576,14 @@ mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
 		h.ipv6++;
 	}
-	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+		l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	else
+		l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
 }
 
 void
@@ -1586,6 +1624,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct mlx5_rxq_data *rxq = dpdk_rxq;
 	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t strd_sz = 1 << rxq->strd_sz_n;
 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
@@ -1602,7 +1641,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint16_t strd_idx;
 		uint32_t byte_cnt;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res = 0;
 		enum mlx5_rqx_code rxq_code;
 
 		if (consumed_strd == strd_n) {
@@ -1618,19 +1656,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (!ret)
 			break;
 		byte_cnt = ret;
-		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
-			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
+		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
+		if (rxq->crc_present)
+			len -= RTE_ETHER_CRC_LEN;
+		if (mcqe &&
+		    rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+			strd_cnt = (len / strd_sz) + (len % strd_sz) ? 1 : 0;
+		else
+			strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+				   MLX5_MPRQ_STRIDE_NUM_SHIFT;
 		MLX5_ASSERT(strd_cnt);
 		consumed_strd += strd_cnt;
 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
 			continue;
-		if (mcqe == NULL) {
-			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
-			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
-		} else {
-			/* mini-CQE for MPRQ doesn't have hash result. */
-			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
-		}
+		strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
+					cqe->wqe_counter :
+					mcqe->stride_idx);
 		MLX5_ASSERT(strd_idx < strd_n);
 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
 			    wq_mask));
@@ -1656,10 +1698,10 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				break;
 			}
 		}
-		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+		rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 		if (cqe->lro_num_seg > 1) {
 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
-					    cqe, len);
+					    cqe, mcqe, rxq, len);
 			pkt->ol_flags |= PKT_RX_LRO;
 			pkt->tso_segsz = len / cqe->lro_num_seg;
 		}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1b5fba4ac7..b3038c4991 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -127,6 +127,8 @@ struct mlx5_rxq_data {
 	unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */
 	unsigned int lro:1; /* Enable LRO. */
 	unsigned int dynf_meta:1; /* Dynamic metadata is configured. */
+	unsigned int mcqe_format:3; /* Dynamic metadata is configured. */
+	uint32_t byte_mask;
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t port_id;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 59662fa12d..7bae27e5ef 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -78,33 +78,47 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	unsigned int pos;
 	unsigned int i;
 	unsigned int inv = 0;
+	const int32_t head =
+		(rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_L34H_STRIDX) ? 0 : -1;
+	const int32_t ftag =
+		(rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) ? 0 : -1;
+	const int32_t hash =
+		(rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_HASH) ? 0 : -1;
 	/* Mask to shuffle from extracted mini CQE to mbuf. */
 	const __m128i shuf_mask1 =
-		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
-			    -1, -1,         /* skip vlan_tci */
-			     6,  7,         /* data_len, bswap16 */
-			    -1, -1,  6,  7, /* pkt_len, bswap16 */
-			    -1, -1, -1, -1  /* skip packet_type */);
+		_mm_set_epi8(-1, 1 | ftag, 0 | ftag, 4 | ftag, /* fdir.hi */
+			      0 | hash, 1 | hash, 2 | hash, 3 | hash, /* rss */
+			     -1, -1,	/* skip vlan_tci */
+			      6,  7,	/* data_len, bswap16 */
+			     -1, -1,	/* zero out 2nd half of pkt_len */
+			      6,  7	/* pkt_len, bswap16 */);
 	const __m128i shuf_mask2 =
-		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
-			    -1, -1,         /* skip vlan_tci */
-			    14, 15,         /* data_len, bswap16 */
-			    -1, -1, 14, 15, /* pkt_len, bswap16 */
-			    -1, -1, -1, -1  /* skip packet_type */);
+		_mm_set_epi8(-1, 9 | ftag, 8 | ftag, 12 | ftag, /* fdir.hi */
+			      8 | hash, 9 | hash, 10 | hash, 11 | hash,/* rss */
+			     -1, -1,	/* skip vlan_tci */
+			     14, 15,	/* data_len, bswap16 */
+			     -1, -1,	/* zero out 2nd half of pkt_len */
+			     14, 15	/* pkt_len, bswap16 */);
 	/* Restore the compressed count. Must be 16 bits. */
 	const uint16_t mcqe_n = t_pkt->data_len +
 				(rxq->crc_present * RTE_ETHER_CRC_LEN);
 	const __m128i rearm =
 		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
-	const __m128i rxdf =
-		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
+	const __m128i rearm_flags =
+		_mm_set1_epi32((uint32_t)t_pkt->ol_flags);
 	const __m128i crc_adj =
-		_mm_set_epi16(0, 0, 0,
+		_mm_set_epi16(0, 0, 0, 0, 0,
 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
 			      0,
-			      rxq->crc_present * RTE_ETHER_CRC_LEN,
-			      0, 0);
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+			      rxq->crc_present * RTE_ETHER_CRC_LEN);
+	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
+	const __m128i ol_flags_mask = _mm_set1_epi32(PKT_RX_RSS_HASH * !hash |
+			PKT_RX_VLAN * !head | PKT_RX_VLAN_STRIPPED * !head |
+			PKT_RX_FDIR * !ftag | PKT_RX_FDIR_ID * !ftag);
+	__m128i ol_flags =
+		_mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH * !hash);
+	__m128i rearm0, rearm1, rearm2, rearm3;
+
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	const __m128i zero = _mm_setzero_si128();
 	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
@@ -118,14 +132,16 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 #endif
 	/*
 	 * A. load mCQEs into a 128bit register.
-	 * B. store rearm data to mbuf.
-	 * C. combine data from mCQEs with rx_descriptor_fields1.
-	 * D. store rx_descriptor_fields1.
-	 * E. store flow tag (rte_flow mark).
+	 * B. combine data from mCQEs with rx_descriptor_fields1.
+	 * C. store rx_descriptor_fields1.
+	 * D. update and store packet type.
+	 * E. update ol_flags according to miniCQEs content.
+	 * F. store dynamic metadata.
+	 * G. store rearm data to mbuf.
 	 */
 	for (pos = 0; pos < mcqe_n; ) {
 		__m128i mcqe1, mcqe2;
-		__m128i rxdf1, rxdf2;
+		__m128i rxdf1, rxdf2, rxdf3, rxdf4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		__m128i byte_cnt, invalid_mask;
 #endif
@@ -136,59 +152,107 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		/* A.1 load mCQEs into a 128bit register. */
 		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
 		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
-		/* B.1 store rearm data to mbuf. */
-		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
-		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
-		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+		/* B.1 combine data from mCQEs with rx_descriptor_fields1. */
 		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
 		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
 		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
 		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
-		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
-		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
-		/* D.1 store rx_descriptor_fields1. */
+		rxdf1 = _mm_add_epi32(rxdf1, flow_mark_adj);
+		rxdf2 = _mm_add_epi32(rxdf2, flow_mark_adj);
+		/* C.1 store rx_descriptor_fields1. */
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos]->rx_descriptor_fields1,
+				  &elts[pos]->pkt_len,
 				 rxdf1);
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos + 1]->rx_descriptor_fields1,
+				  &elts[pos + 1]->pkt_len,
 				 rxdf2);
-		/* B.1 store rearm data to mbuf. */
-		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
-		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
-		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
-		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
-		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
-		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
-		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
-		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
-		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
-		/* D.1 store rx_descriptor_fields1. */
+		/* B.1 combine data from mCQEs with rx_descriptor_fields1. */
+		rxdf3 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
+		rxdf4 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
+		rxdf3 = _mm_sub_epi16(rxdf3, crc_adj);
+		rxdf4 = _mm_sub_epi16(rxdf4, crc_adj);
+		rxdf3 = _mm_add_epi32(rxdf3, flow_mark_adj);
+		rxdf4 = _mm_add_epi32(rxdf4, flow_mark_adj);
+		/* C.1 store rx_descriptor_fields1. */
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos + 2]->rx_descriptor_fields1,
-				 rxdf1);
+				  &elts[pos + 2]->pkt_len,
+				 rxdf3);
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos + 3]->rx_descriptor_fields1,
-				 rxdf2);
+				  &elts[pos + 3]->pkt_len,
+				 rxdf4);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		invalid_mask = _mm_set_epi64x(0,
 					      (mcqe_n - pos) *
 					      sizeof(uint16_t) * 8);
 		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
-		mcqe1 = _mm_srli_si128(mcqe1, 4);
-		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
+		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
+					   mcqe2, 0xcc);
 		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
 		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
 		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
 		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
 #endif
-		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+		/* D.1 update and store packet type. */
+		if (head == -1) {
+			const uint32_t packet_type = t_pkt->packet_type;
+
+			elts[pos]->packet_type = packet_type;
+			elts[pos + 1]->packet_type = packet_type;
+			elts[pos + 2]->packet_type = packet_type;
+			elts[pos + 3]->packet_type = packet_type;
+		} else {
+			const uint8_t pkt_info = (cq->pkt_info & 0x3) << 6;
+			const uint8_t pt_idx0 = pkt_info |
+						_mm_extract_epi8(mcqe1, 0) >> 2;
+			const uint8_t pt_idx1 = pkt_info |
+						_mm_extract_epi8(mcqe1, 8) >> 2;
+			const uint8_t pt_idx2 = pkt_info |
+						_mm_extract_epi8(mcqe2, 0) >> 2;
+			const uint8_t pt_idx3 = pkt_info |
+						_mm_extract_epi8(mcqe2, 8) >> 2;
+			const __m128i vlan_mask =
+				_mm_set_epi32(_mm_extract_epi8(mcqe1, 0) & 0x1,
+					      _mm_extract_epi8(mcqe1, 8) & 0x1,
+					      _mm_extract_epi8(mcqe2, 0) & 0x1,
+					      _mm_extract_epi8(mcqe2, 8) & 0x1);
+
+			elts[pos]->packet_type = mlx5_ptype_table[pt_idx0] |
+				!!(pt_idx0 & (1 << 6)) * rxq->tunnel;
+			elts[pos + 1]->packet_type = mlx5_ptype_table[pt_idx1] |
+				!!(pt_idx1 & (1 << 6)) * rxq->tunnel;
+			elts[pos + 2]->packet_type = mlx5_ptype_table[pt_idx2] |
+				!!(pt_idx2 & (1 << 6)) * rxq->tunnel;
+			elts[pos + 3]->packet_type = mlx5_ptype_table[pt_idx3] |
+				!!(pt_idx3 & (1 << 6)) * rxq->tunnel;
+			ol_flags = _mm_or_si128(ol_flags, vlan_mask);
 		}
+		/* E.1 update ol_flags according to miniCQEs content. */
+		if (rxq->mark && ftag == 0) {
+			/* Extract flow_tag field. */
+			const __m128i ftag0 = _mm_unpackhi_epi32(rxdf1, rxdf2);
+			const __m128i ftag1 = _mm_unpackhi_epi32(rxdf3, rxdf4);
+			const __m128i ftag = _mm_unpackhi_epi64(ftag0, ftag1);
+			const __m128i ft_mask = _mm_set1_epi32(0xffffff00);
+			const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
+			__m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
+			__m128i flow_tag, invalid_mask;
+
+			flow_tag = _mm_and_si128(ftag, ft_mask);
+			/* Check if flow tag is non-zero - set PKT_RX_FDIR. */
+			invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
+			ol_flags = _mm_or_si128(ol_flags,
+						_mm_andnot_si128(invalid_mask,
+								fdir_flags));
+			/* Mask out invalid entries. */
+			fdir_id_flags = _mm_andnot_si128(invalid_mask,
+							 fdir_id_flags);
+			/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+			ol_flags = _mm_or_si128(ol_flags,
+				_mm_andnot_si128(_mm_cmpeq_epi32(flow_tag,
+								 ft_mask),
+				fdir_id_flags));
+		}
+		/* F. store dynamic metadata. */
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
 			const uint32_t meta =
@@ -208,6 +272,21 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 							uint32_t *) = meta;
 			}
 		}
+		/* Merge rearm and ol_flags. */
+		ol_flags = _mm_or_si128(ol_flags,
+				_mm_andnot_si128(ol_flags_mask, rearm_flags));
+		rearm0 = _mm_blend_epi16(rearm,
+					 _mm_slli_si128(ol_flags, 8), 0x30);
+		rearm1 = _mm_blend_epi16(rearm,
+					 _mm_slli_si128(ol_flags, 4), 0x30);
+		rearm2 = _mm_blend_epi16(rearm, ol_flags, 0x30);
+		rearm3 = _mm_blend_epi16(rearm,
+					 _mm_srli_si128(ol_flags, 4), 0x30);
+		/* G.1 store rearm data to mbuf. */
+		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm0);
+		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm1);
+		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm2);
+		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm3);
 		pos += MLX5_VPMD_DESCS_PER_LOOP;
 		/* Move to next CQE and invalidate consumed CQEs. */
 		if (!(pos & 0x7) && pos < mcqe_n) {
@@ -251,12 +330,9 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
 	__m128i cv_flags;
 	const __m128i zero = _mm_setzero_si128();
-	const __m128i ptype_mask =
-		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
-	const __m128i ptype_ol_mask =
-		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
-	const __m128i pinfo_mask =
-		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
+	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
+	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
+	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
 	const __m128i cv_flag_sel =
 		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
 			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
@@ -268,13 +344,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
 			     0);
 	const __m128i cv_mask =
-		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+		_mm_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
 	const __m128i mbuf_init =
 		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH v2] net/mlx5: support Flow Tag and Packet Header miniCQEs
  2020-10-28  2:37 [dpdk-dev] [PATCH] net/mlx5: support Flow Tag and Packet Header miniCQEs Alexander Kozyrev
@ 2020-11-01 16:27 ` Alexander Kozyrev
  2020-11-01 16:38   ` Slava Ovsiienko
  0 siblings, 1 reply; 5+ messages in thread
From: Alexander Kozyrev @ 2020-11-01 16:27 UTC (permalink / raw)
  To: dev; +Cc: rasland, viacheslavo, matan

CQE compression allows us to save the PCI bandwidth and improve
the performance by compressing several CQEs together to a miniCQE.
But the miniCQE size is only 8 bytes and this limits the ability
to successfully keep the compression session in case of various
traffic patterns.

The current miniCQE format only keeps the compression session alive
in case of uniform traffic with the Hash RSS as the only difference.
There are requests to keep the compression session in case of tagged
traffic by RTE Flow Mark Id and mixed UDP/TCP and IPv4/IPv6 traffic.
Add 2 new miniCQE formats in order to achieve the best performance
for these traffic patterns: Flow Tag and Packet Header miniCQEs.

The existing rxq_cqe_comp_en devarg is modified to specify the
desired miniCQE format. Specifying 2 selects Flow Tag format
for better compression rate in case of RTE Flow Mark traffic.
Specifying 3 selects Checksum format (existing format for MPRQ).
Specifying 4 selects L3/L4 Header format for better compression
rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
v1: https://patchwork.dpdk.org/patch/82500/
    added ARM and PowerPC support, reworked x86 code
---
 doc/guides/nics/mlx5.rst                 |   8 +
 doc/guides/rel_notes/release_20_11.rst   |   2 +
 drivers/common/mlx5/mlx5_devx_cmds.c     |   7 +-
 drivers/common/mlx5/mlx5_devx_cmds.h     |   1 +
 drivers/common/mlx5/mlx5_prm.h           |  27 ++-
 drivers/net/mlx5/mlx5.c                  |   7 +
 drivers/net/mlx5/mlx5.h                  |   1 +
 drivers/net/mlx5/mlx5_devx.c             |  44 +++--
 drivers/net/mlx5/mlx5_rxtx.c             | 151 +++++++++------
 drivers/net/mlx5/mlx5_rxtx.h             |   2 +
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 223 ++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 141 +++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 167 ++++++++++++++---
 13 files changed, 675 insertions(+), 106 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 8c59cc6a60..5d3d0c4440 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -432,6 +432,14 @@ Driver options
 
   A nonzero value enables the compression of CQE on RX side. This feature
   allows to save PCI bandwidth and improve performance. Enabled by default.
+  Different compression formats are supported in order to achieve the best
+  performance for different traffic patterns. Hash RSS format is the default.
+
+  Specifying 2 as a ``rxq_cqe_comp_en`` value selects Flow Tag format for
+  better compression rate in case of RTE Flow Mark traffic.
+  Specifying 3 as a ``rxq_cqe_comp_en`` value selects Checksum format.
+  Specifying 4 as a ``rxq_cqe_comp_en`` value selects L3/L4 Header format for
+  better compression rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.
 
   Supported on:
 
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index eb5908fd37..36d8ff6219 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -231,6 +231,8 @@ New Features
   * Added support for the new vlan fields ``has_vlan`` in the eth item and
     ``has_more_vlan`` in the vlan item.
   * Added support for PMD level multiple-thread flow insertion.
+  * Added vectorized Multi-Packet Rx Queue burst.
+  * Added support for 2 new miniCQE formats: Flow Tag and L3/L4 header.
 
   Updated Mellanox mlx5 vDPA driver:
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index a13febebfd..e1ac62a352 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1565,8 +1565,11 @@ mlx5_devx_cmd_create_cq(void *ctx, struct mlx5_devx_cq_attr *attr)
 		 MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET(cqc, cqctx, c_eqn, attr->eqn);
 	MLX5_SET(cqc, cqctx, uar_page, attr->uar_page_id);
-	MLX5_SET(cqc, cqctx, cqe_comp_en, attr->cqe_comp_en);
-	MLX5_SET(cqc, cqctx, mini_cqe_res_format, attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, cqe_comp_en, !!attr->cqe_comp_en);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format,
+		 attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format_ext,
+		 attr->mini_cqe_res_format_ext);
 	MLX5_SET(cqc, cqctx, cqe_sz, attr->cqe_size);
 	if (attr->q_umem_valid) {
 		MLX5_SET(create_cq_in, in, cq_umem_valid, attr->q_umem_valid);
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index 472d1cb658..7feaafacf1 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -256,6 +256,7 @@ struct mlx5_devx_cq_attr {
 	uint32_t overrun_ignore:1;
 	uint32_t cqe_comp_en:1;
 	uint32_t mini_cqe_res_format:2;
+	uint32_t mini_cqe_res_format_ext:2;
 	uint32_t cqe_size:3;
 	uint32_t log_cq_size:5;
 	uint32_t log_page_size:5;
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 7d671a3996..d8437d13f6 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -239,6 +239,9 @@
 /* Default mark mask for metadata legacy mode. */
 #define MLX5_FLOW_MARK_MASK 0xffffff
 
+/* Byte length mask when mark is enabled in miniCQE */
+#define MLX5_LEN_WITH_MARK_MASK 0xffffff00
+
 /* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
@@ -2152,11 +2155,14 @@ struct mlx5_ifc_cqc_bits {
 	u8 cqe_comp_en[0x1];
 	u8 mini_cqe_res_format[0x2];
 	u8 st[0x4];
-	u8 reserved_at_18[0x8];
+	u8 reserved_at_18[0x1];
+	u8 cqe_comp_layout[0x7];
 	u8 dbr_umem_id[0x20];
 	u8 reserved_at_40[0x14];
 	u8 page_offset[0x6];
-	u8 reserved_at_5a[0x6];
+	u8 reserved_at_5a[0x2];
+	u8 mini_cqe_res_format_ext[0x2];
+	u8 cq_timestamp_format[0x2];
 	u8 reserved_at_60[0x3];
 	u8 log_cq_size[0x5];
 	u8 uar_page[0x18];
@@ -2918,7 +2924,14 @@ struct mlx5_mini_cqe8 {
 	union {
 		uint32_t rx_hash_result;
 		struct {
-			uint16_t checksum;
+			union {
+				uint16_t checksum;
+				uint16_t flow_tag_high;
+				struct {
+					uint8_t reserved;
+					uint8_t hdr_type;
+				};
+			};
 			uint16_t stride_idx;
 		};
 		struct {
@@ -2927,15 +2940,19 @@ struct mlx5_mini_cqe8 {
 			uint8_t  reserved;
 		} s_wqe_info;
 	};
-	uint32_t byte_cnt;
+	union {
+		uint32_t byte_cnt_flow;
+		uint32_t byte_cnt;
+	};
 };
 
 /* Mini CQE responder format. */
 enum {
 	MLX5_CQE_RESP_FORMAT_HASH = 0x0,
 	MLX5_CQE_RESP_FORMAT_CSUM = 0x1,
-	MLX5_CQE_RESP_FORMAT_CSUM_FLOW_TAG = 0x2,
+	MLX5_CQE_RESP_FORMAT_FTAG_STRIDX = 0x2,
 	MLX5_CQE_RESP_FORMAT_CSUM_STRIDX = 0x3,
+	MLX5_CQE_RESP_FORMAT_L34H_STRIDX = 0x4,
 };
 
 /* srTCM PRM flow meter parameters. */
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 862bd401d9..67a530d85a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1361,7 +1361,14 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	}
 	mod = tmp >= 0 ? tmp : -tmp;
 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
+		if (tmp > MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+			DRV_LOG(ERR, "invalid CQE compression "
+				     "format parameter");
+			rte_errno = EINVAL;
+			return -rte_errno;
+		}
 		config->cqe_comp = !!tmp;
+		config->cqe_comp_fmt = tmp;
 	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
 		config->cqe_pad = !!tmp;
 	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index b080426b72..265e8f09ee 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -205,6 +205,7 @@ struct mlx5_dev_config {
 	/* Whether tunnel stateless offloads are supported. */
 	unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
+	unsigned int cqe_comp_fmt:3; /* CQE compression format. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 5fce4cd555..b3acbc24fc 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -437,17 +437,39 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)
 	if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
 	    !rxq_data->lro) {
 		cq_attr.cqe_comp_en = 1u;
-		/*
-		 * Select CSUM miniCQE format only for non-vectorized MPRQ
-		 * Rx burst, use HASH miniCQE format for everything else.
-		 */
-		if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
-			mlx5_rxq_mprq_enabled(rxq_data))
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
-		else
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_HASH;
+		rxq_data->mcqe_format = priv->config.cqe_comp_fmt;
+		rxq_data->byte_mask = UINT32_MAX;
+		switch (priv->config.cqe_comp_fmt) {
+		case MLX5_CQE_RESP_FORMAT_HASH:
+			/* fallthrough */
+		case MLX5_CQE_RESP_FORMAT_CSUM:
+			/*
+			 * Select CSUM miniCQE format only for non-vectorized
+			 * MPRQ Rx burst, use HASH miniCQE format for others.
+			 */
+			if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
+			    mlx5_rxq_mprq_enabled(rxq_data))
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+			else
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_HASH;
+			rxq_data->mcqe_format = cq_attr.mini_cqe_res_format;
+			break;
+		case MLX5_CQE_RESP_FORMAT_FTAG_STRIDX:
+			rxq_data->byte_mask = MLX5_LEN_WITH_MARK_MASK;
+			/* fallthrough */
+		case MLX5_CQE_RESP_FORMAT_CSUM_STRIDX:
+			cq_attr.mini_cqe_res_format = priv->config.cqe_comp_fmt;
+			break;
+		case MLX5_CQE_RESP_FORMAT_L34H_STRIDX:
+			cq_attr.mini_cqe_res_format = 0;
+			cq_attr.mini_cqe_res_format_ext = 1;
+			break;
+		}
+		DRV_LOG(DEBUG,
+			"Port %u Rx CQE compression is enabled, format %d.",
+			dev->data->port_id, priv->config.cqe_comp_fmt);
 		/*
 		 * For vectorized Rx, it must not be doubled in order to
 		 * make cq_ci and rq_ci aligned.
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2ffacf8882..6d9add8bb8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -80,7 +80,8 @@ static uint16_t mlx5_tx_burst_##func(void *txq, \
 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
 
 static __rte_always_inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe);
 
 static __rte_always_inline int
 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
@@ -91,7 +92,8 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
 
 static __rte_always_inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe);
 
 static int
 mlx5_queue_state_modify(struct rte_eth_dev *dev,
@@ -100,12 +102,13 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev,
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum);
+			uint32_t phcsum, uint8_t l4_type);
 
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len);
+			volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len);
 
 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
@@ -813,12 +816,19 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
  *   Packet type for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe)
 {
 	uint8_t idx;
-	uint8_t pinfo = cqe->pkt_info;
-	uint16_t ptype = cqe->hdr_type_etc;
+	uint8_t ptype;
+	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
 
+	/* Get l3/l4 header from mini-CQE in case of L3/L4 format. */
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
+	else
+		ptype = mcqe->hdr_type >> 2;
 	/*
 	 * The index to the array should have:
 	 * bit[1:0] = l3_hdr_type
@@ -827,7 +837,7 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
 	 * bit[6] = tunneled
 	 * bit[7] = outer_l3_type
 	 */
-	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+	idx = pinfo | ptype;
 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
 }
 
@@ -1131,8 +1141,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				(volatile struct mlx5_mini_cqe8 (*)[8])
 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
 							  cqe_cnt].pkt_info);
-
-			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
+					       rxq->byte_mask);
 			*mcqe = &(*mc)[zip->ai & 7];
 			if ((++zip->ai & 7) == 0) {
 				/* Invalidate consumed CQEs */
@@ -1210,7 +1220,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				--rxq->cq_ci;
 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
 				/* Get packet size to return. */
-				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+				len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
+						       rxq->byte_mask);
 				*mcqe = &(*mc)[0];
 				zip->ai = 1;
 				/* Prefetch all to be invalidated */
@@ -1274,22 +1285,42 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
  */
 static inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe)
 {
 	/* Update packet information. */
-	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
-	if (rss_hash_res && rxq->rss_hash) {
-		pkt->hash.rss = rss_hash_res;
-		pkt->ol_flags |= PKT_RX_RSS_HASH;
+	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
+
+	if (rxq->rss_hash) {
+		uint32_t rss_hash_res = 0;
+
+		/* If compressed, take hash result from mini-CQE. */
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
+			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+		else
+			rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
+		if (rss_hash_res) {
+			pkt->hash.rss = rss_hash_res;
+			pkt->ol_flags |= PKT_RX_RSS_HASH;
+		}
 	}
-	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
-		pkt->ol_flags |= PKT_RX_FDIR;
-		if (cqe->sop_drop_qpn !=
-		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
-			uint32_t mark = cqe->sop_drop_qpn;
-
-			pkt->ol_flags |= PKT_RX_FDIR_ID;
-			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+	if (rxq->mark) {
+		uint32_t mark = 0;
+
+		/* If compressed, take flow tag from mini-CQE. */
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+			mark = cqe->sop_drop_qpn;
+		else
+			mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
+				(mcqe->flow_tag_high << 16);
+		if (MLX5_FLOW_MARK_IS_VALID(mark)) {
+			pkt->ol_flags |= PKT_RX_FDIR;
+			if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) {
+				pkt->ol_flags |= PKT_RX_FDIR_ID;
+				pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+			}
 		}
 	}
 	if (rxq->dynf_meta && cqe->flow_table_metadata) {
@@ -1299,10 +1330,20 @@ rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
 	}
 	if (rxq->csum)
 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
-	if (rxq->vlan_strip &&
-	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
-		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
-		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+	if (rxq->vlan_strip) {
+		bool vlan_strip;
+
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+			vlan_strip = cqe->hdr_type_etc &
+				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
+		else
+			vlan_strip = mcqe->hdr_type &
+				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
+		if (vlan_strip) {
+			pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+			pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+		}
 	}
 	if (rxq->hw_timestamp) {
 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
@@ -1348,7 +1389,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res;
 
 		if (pkt)
 			NEXT(seg) = rep;
@@ -1387,18 +1427,14 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			pkt = seg;
 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
 			pkt->ol_flags &= EXT_ATTACHED_MBUF;
-			/* If compressed, take hash result from mini-CQE. */
-			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
-							cqe->rx_hash_res :
-							mcqe->rx_hash_result);
-			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 			if (rxq->crc_present)
 				len -= RTE_ETHER_CRC_LEN;
 			PKT_LEN(pkt) = len;
 			if (cqe->lro_num_seg > 1) {
 				mlx5_lro_update_hdr
 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
-					 len);
+					 mcqe, rxq, len);
 				pkt->ol_flags |= PKT_RX_LRO;
 				pkt->tso_segsz = len / cqe->lro_num_seg;
 			}
@@ -1468,10 +1504,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum)
+			uint32_t phcsum, uint8_t l4_type)
 {
-	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
-			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
 	/*
 	 * The HW calculates only the TCP payload checksum, need to complete
 	 * the TCP header checksum and the L3 pseudo-header checksum.
@@ -1510,7 +1544,8 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len)
+		    volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len)
 {
 	union {
 		struct rte_ether_hdr *eth;
@@ -1524,6 +1559,7 @@ mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 	};
 	uint16_t proto = h.eth->ether_type;
 	uint32_t phcsum;
+	uint8_t l4_type;
 
 	h.eth++;
 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
@@ -1545,7 +1581,14 @@ mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
 		h.ipv6++;
 	}
-	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+		l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	else
+		l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
 }
 
 void
@@ -1586,6 +1629,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct mlx5_rxq_data *rxq = dpdk_rxq;
 	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t strd_sz = 1 << rxq->strd_sz_n;
 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
@@ -1602,7 +1646,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint16_t strd_idx;
 		uint32_t byte_cnt;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res = 0;
 		enum mlx5_rqx_code rxq_code;
 
 		if (consumed_strd == strd_n) {
@@ -1618,19 +1661,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (!ret)
 			break;
 		byte_cnt = ret;
-		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
-			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
+		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
+		if (rxq->crc_present)
+			len -= RTE_ETHER_CRC_LEN;
+		if (mcqe &&
+		    rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+			strd_cnt = (len / strd_sz) + !!(len % strd_sz);
+		else
+			strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+				   MLX5_MPRQ_STRIDE_NUM_SHIFT;
 		MLX5_ASSERT(strd_cnt);
 		consumed_strd += strd_cnt;
 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
 			continue;
-		if (mcqe == NULL) {
-			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
-			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
-		} else {
-			/* mini-CQE for MPRQ doesn't have hash result. */
-			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
-		}
+		strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
+					cqe->wqe_counter :
+					mcqe->stride_idx);
 		MLX5_ASSERT(strd_idx < strd_n);
 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
 			    wq_mask));
@@ -1656,10 +1703,10 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				break;
 			}
 		}
-		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+		rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 		if (cqe->lro_num_seg > 1) {
 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
-					    cqe, len);
+					    cqe, mcqe, rxq, len);
 			pkt->ol_flags |= PKT_RX_LRO;
 			pkt->tso_segsz = len / cqe->lro_num_seg;
 		}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3ae5e01d37..071483271c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -126,6 +126,7 @@ struct mlx5_rxq_data {
 	unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */
 	unsigned int lro:1; /* Enable LRO. */
 	unsigned int dynf_meta:1; /* Dynamic metadata is configured. */
+	unsigned int mcqe_format:3; /* CQE compression miniCQE format. */
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t port_id;
@@ -135,6 +136,7 @@ struct mlx5_rxq_data {
 	uint32_t rq_pi;
 	uint32_t cq_ci;
 	uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
+	uint32_t byte_mask;
 	union {
 		struct rxq_zip zip; /* Compressed context. */
 		uint16_t decompressed;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index cf3a795843..8fff7f729c 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -108,7 +108,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	const vector unsigned short rxdf_sel_mask =
 		(vector unsigned short){
 			0xffff, 0xffff, 0, 0, 0, 0xffff, 0, 0};
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	vector unsigned char ol_flags = (vector unsigned char){0};
+	vector unsigned char ol_flags_mask = (vector unsigned char){0};
 	unsigned int pos;
 	unsigned int i;
 	unsigned int inv = 0;
@@ -231,11 +232,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			vec_sel((vector unsigned long)shmask,
 			(vector unsigned long)invalid_mask, shmask);
 
-		mcqe1 = (vector unsigned char)
+		byte_cnt = (vector unsigned char)
+			vec_sel((vector unsigned short)
 			vec_sro((vector unsigned short)mcqe1,
 			(vector unsigned char){32}),
-		byte_cnt = (vector unsigned char)
-			vec_sel((vector unsigned short)mcqe1,
 			(vector unsigned short)mcqe2, mcqe_sel_mask);
 		byte_cnt = vec_perm(byte_cnt, zero, len_shuf_mask);
 		byte_cnt = (vector unsigned char)
@@ -255,11 +255,216 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 #endif
 
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+			    MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			} else {
+				const vector unsigned char flow_mark_adj =
+					(vector unsigned char)
+					(vector unsigned int){
+					-1, -1, -1, -1};
+				const vector unsigned char flow_mark_shuf =
+					(vector unsigned char){
+					-1, -1, -1, -1,
+					-1, -1, -1, -1,
+					12,  8,  9, -1,
+					 4,  0,  1,  -1};
+				const vector unsigned char ft_mask =
+					(vector unsigned char)
+					(vector unsigned int){
+					0xffffff00, 0xffffff00,
+					0xffffff00, 0xffffff00};
+				const vector unsigned char fdir_flags =
+					(vector unsigned char)
+					(vector unsigned int){
+					PKT_RX_FDIR, PKT_RX_FDIR,
+					PKT_RX_FDIR, PKT_RX_FDIR};
+				const vector unsigned char fdir_all_flags =
+					(vector unsigned char)
+					(vector unsigned int){
+					PKT_RX_FDIR | PKT_RX_FDIR_ID,
+					PKT_RX_FDIR | PKT_RX_FDIR_ID,
+					PKT_RX_FDIR | PKT_RX_FDIR_ID,
+					PKT_RX_FDIR | PKT_RX_FDIR_ID};
+				vector unsigned char fdir_id_flags =
+					(vector unsigned char)
+					(vector unsigned int){
+					PKT_RX_FDIR_ID, PKT_RX_FDIR_ID,
+					PKT_RX_FDIR_ID, PKT_RX_FDIR_ID};
+				/* Extract flow_tag field. */
+				vector unsigned char ftag0 = vec_perm(mcqe1,
+							zero, flow_mark_shuf);
+				vector unsigned char ftag1 = vec_perm(mcqe2,
+							zero, flow_mark_shuf);
+				vector unsigned char ftag =
+					(vector unsigned char)
+					vec_mergel((vector unsigned int)ftag0,
+					(vector unsigned int)ftag1);
+				vector unsigned char invalid_mask =
+					(vector unsigned char)
+					vec_cmpeq((vector unsigned int)ftag,
+					(vector unsigned int)zero);
+
+				ol_flags_mask = (vector unsigned char)
+					vec_or((vector unsigned long)
+					ol_flags_mask,
+					(vector unsigned long)fdir_all_flags);
+
+				/* Set PKT_RX_FDIR if flow tag is non-zero. */
+				invalid_mask = (vector unsigned char)
+					vec_cmpeq((vector unsigned int)ftag,
+					(vector unsigned int)zero);
+				ol_flags = (vector unsigned char)
+					vec_or((vector unsigned long)ol_flags,
+					(vector unsigned long)
+					vec_andc((vector unsigned long)
+					fdir_flags,
+					(vector unsigned long)invalid_mask));
+				ol_flags_mask = (vector unsigned char)
+					vec_or((vector unsigned long)
+					ol_flags_mask,
+					(vector unsigned long)fdir_flags);
+
+				/* Mask out invalid entries. */
+				fdir_id_flags = (vector unsigned char)
+					vec_andc((vector unsigned long)
+					fdir_id_flags,
+					(vector unsigned long)invalid_mask);
+
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = (vector unsigned char)
+					vec_or((vector unsigned long)ol_flags,
+					(vector unsigned long)
+					vec_andc((vector unsigned long)
+					fdir_id_flags,
+					(vector unsigned long)
+					vec_cmpeq((vector unsigned int)ftag,
+					(vector unsigned int)ft_mask)));
+
+				ftag = (vector unsigned char)
+					((vector unsigned int)ftag +
+					(vector unsigned int)flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+					((vector unsigned int)ftag)[0];
+				elts[pos + 1]->hash.fdir.hi =
+					((vector unsigned int)ftag)[1];
+				elts[pos + 2]->hash.fdir.hi =
+					((vector unsigned int)ftag)[2];
+				elts[pos + 3]->hash.fdir.hi =
+					((vector unsigned int)ftag)[3];
+			}
+		}
+		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					mcq[pos % 8].hdr_type;
+				const uint8_t pkt_hdr1 =
+					mcq[pos % 8 + 1].hdr_type;
+				const uint8_t pkt_hdr2 =
+					mcq[pos % 8 + 2].hdr_type;
+				const uint8_t pkt_hdr3 =
+					mcq[pos % 8 + 3].hdr_type;
+				const vector unsigned char vlan_mask =
+					(vector unsigned char)
+					(vector unsigned int) {
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED)};
+				const vector unsigned char cv_mask =
+					(vector unsigned char)
+					(vector unsigned int) {
+					MLX5_CQE_VLAN_STRIPPED,
+					MLX5_CQE_VLAN_STRIPPED,
+					MLX5_CQE_VLAN_STRIPPED,
+					MLX5_CQE_VLAN_STRIPPED};
+				vector unsigned char pkt_cv =
+					(vector unsigned char)
+					(vector unsigned int) {
+					pkt_hdr0 & 0x1, pkt_hdr1 & 0x1,
+					pkt_hdr2 & 0x1, pkt_hdr3 & 0x1};
+
+				ol_flags_mask = (vector unsigned char)
+					vec_or((vector unsigned long)
+					ol_flags_mask,
+					(vector unsigned long)vlan_mask);
+				ol_flags = (vector unsigned char)
+					vec_or((vector unsigned long)ol_flags,
+					(vector unsigned long)
+					vec_and((vector unsigned long)vlan_mask,
+					(vector unsigned long)
+					vec_cmpeq((vector unsigned int)pkt_cv,
+					(vector unsigned int)cv_mask)));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const vector unsigned char hash_mask =
+				(vector unsigned char)(vector unsigned int) {
+					PKT_RX_RSS_HASH,
+					PKT_RX_RSS_HASH,
+					PKT_RX_RSS_HASH,
+					PKT_RX_RSS_HASH};
+			const vector unsigned char rearm_flags =
+				(vector unsigned char)(vector unsigned int) {
+				(uint32_t)t_pkt->ol_flags,
+				(uint32_t)t_pkt->ol_flags,
+				(uint32_t)t_pkt->ol_flags,
+				(uint32_t)t_pkt->ol_flags};
+
+			ol_flags_mask = (vector unsigned char)
+				vec_or((vector unsigned long)ol_flags_mask,
+				(vector unsigned long)hash_mask);
+			ol_flags = (vector unsigned char)
+				vec_or((vector unsigned long)ol_flags,
+				(vector unsigned long)
+				vec_andc((vector unsigned long)rearm_flags,
+				(vector unsigned long)ol_flags_mask));
+
+			elts[pos]->ol_flags =
+				((vector unsigned int)ol_flags)[0];
+			elts[pos + 1]->ol_flags =
+				((vector unsigned int)ol_flags)[1];
+			elts[pos + 2]->ol_flags =
+				((vector unsigned int)ol_flags)[2];
+			elts[pos + 3]->ol_flags =
+				((vector unsigned int)ol_flags)[3];
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 47b6692942..d5fe00857c 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -111,7 +111,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
 		0, 0
 	};
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	uint32x4_t ol_flags = {0, 0, 0, 0};
+	uint32x4_t ol_flags_mask = {0, 0, 0, 0};
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	uint32_t rcvd_byte = 0;
 #endif
@@ -198,11 +199,139 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
 #endif
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+			    MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			}  else {
+				const uint32x4_t flow_mark_adj = {
+					-1, -1, -1, -1 };
+				const uint8x16_t flow_mark_shuf = {
+					28, 24, 25, -1,
+					20, 16, 17, -1,
+					12,  8,  9, -1,
+					 4,  0,  1, -1};
+				/* Extract flow_tag field. */
+				const uint32x4_t ft_mask =
+					vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
+				const uint32x4_t fdir_flags =
+					vdupq_n_u32(PKT_RX_FDIR);
+				const uint32x4_t fdir_all_flags =
+					vdupq_n_u32(PKT_RX_FDIR |
+						    PKT_RX_FDIR_ID);
+				uint32x4_t fdir_id_flags =
+					vdupq_n_u32(PKT_RX_FDIR_ID);
+				uint32x4_t invalid_mask, ftag;
+
+				__asm__ volatile
+				/* A.1 load mCQEs into a 128bit register. */
+				("ld1 {v16.16b - v17.16b}, [%[mcq]]\n\t"
+				/* Extract flow_tag. */
+				 "tbl %[ftag].16b, {v16.16b - v17.16b}, %[flow_mark_shuf].16b\n\t"
+				: [ftag]"=&w"(ftag)
+				: [mcq]"r"(p),
+				  [flow_mark_shuf]"w"(flow_mark_shuf)
+				: "memory", "v16", "v17");
+				invalid_mask = vceqzq_u32(ftag);
+				ol_flags_mask = vorrq_u32(ol_flags_mask,
+							  fdir_all_flags);
+				/* Set PKT_RX_FDIR if flow tag is non-zero. */
+				ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(fdir_flags, invalid_mask));
+				/* Mask out invalid entries. */
+				fdir_id_flags = vbicq_u32(fdir_id_flags,
+							  invalid_mask);
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(fdir_id_flags,
+						  vceqq_u32(ftag, ft_mask)));
+				ftag = vaddq_u32(ftag, flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 3);
+				elts[pos + 1]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 2);
+				elts[pos + 2]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 1);
+				elts[pos + 3]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 0);
+				}
+		}
+		if (unlikely(rxq->mcqe_format !=
+			     MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					mcq[pos % 8].hdr_type;
+				const uint8_t pkt_hdr1 =
+					mcq[pos % 8 + 1].hdr_type;
+				const uint8_t pkt_hdr2 =
+					mcq[pos % 8 + 2].hdr_type;
+				const uint8_t pkt_hdr3 =
+					mcq[pos % 8 + 3].hdr_type;
+				const uint32x4_t vlan_mask =
+					vdupq_n_u32(PKT_RX_VLAN |
+						    PKT_RX_VLAN_STRIPPED);
+				const uint32x4_t cv_mask =
+					vdupq_n_u32(MLX5_CQE_VLAN_STRIPPED);
+				const uint32x4_t pkt_cv = {
+					pkt_hdr0 & 0x1, pkt_hdr1 & 0x1,
+					pkt_hdr2 & 0x1, pkt_hdr3 & 0x1};
+
+				ol_flags_mask = vorrq_u32(ol_flags_mask,
+							  vlan_mask);
+				ol_flags = vorrq_u32(ol_flags,
+						vandq_u32(vlan_mask,
+						vceqq_u32(pkt_cv, cv_mask)));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const uint32x4_t hash_flags =
+				vdupq_n_u32(PKT_RX_RSS_HASH);
+			const uint32x4_t rearm_flags =
+				vdupq_n_u32((uint32_t)t_pkt->ol_flags);
+
+			ol_flags_mask = vorrq_u32(ol_flags_mask, hash_flags);
+			ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(rearm_flags, ol_flags_mask));
+			elts[pos]->ol_flags = vgetq_lane_u32(ol_flags, 3);
+			elts[pos + 1]->ol_flags = vgetq_lane_u32(ol_flags, 2);
+			elts[pos + 2]->ol_flags = vgetq_lane_u32(ol_flags, 1);
+			elts[pos + 3]->ol_flags = vgetq_lane_u32(ol_flags, 0);
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 59662fa12d..732e5859a4 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -104,7 +104,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			      0,
 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
 			      0, 0);
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	__m128i ol_flags = _mm_setzero_si128();
+	__m128i ol_flags_mask = _mm_setzero_si128();
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	const __m128i zero = _mm_setzero_si128();
 	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
@@ -175,19 +176,152 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 					      (mcqe_n - pos) *
 					      sizeof(uint16_t) * 8);
 		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
-		mcqe1 = _mm_srli_si128(mcqe1, 4);
-		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
+		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
+					   mcqe2, 0xcc);
 		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
 		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
 		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
 		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
 #endif
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+				MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			} else {
+				const __m128i flow_mark_adj =
+					_mm_set_epi32(-1, -1, -1, -1);
+				const __m128i flow_mark_shuf =
+					_mm_set_epi8(-1,  1,  0,  4,
+						     -1,  9,  8, 12,
+						     -1, -1, -1, -1,
+						     -1, -1, -1, -1);
+				const __m128i ft_mask =
+					_mm_set1_epi32(0xffffff00);
+				const __m128i fdir_flags =
+					_mm_set1_epi32(PKT_RX_FDIR);
+				const __m128i fdir_all_flags =
+					_mm_set1_epi32(PKT_RX_FDIR |
+						       PKT_RX_FDIR_ID);
+				__m128i fdir_id_flags =
+					_mm_set1_epi32(PKT_RX_FDIR_ID);
+
+				/* Extract flow_tag field. */
+				__m128i ftag0 =
+					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
+				__m128i ftag1 =
+					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
+				__m128i ftag =
+					_mm_unpackhi_epi64(ftag0, ftag1);
+				__m128i invalid_mask =
+					_mm_cmpeq_epi32(ftag, zero);
+
+				ol_flags_mask = _mm_or_si128(ol_flags_mask,
+							     fdir_all_flags);
+				/* Set PKT_RX_FDIR if flow tag is non-zero. */
+				ol_flags = _mm_or_si128(ol_flags,
+					_mm_andnot_si128(invalid_mask,
+							 fdir_flags));
+				/* Mask out invalid entries. */
+				fdir_id_flags = _mm_andnot_si128(invalid_mask,
+								 fdir_id_flags);
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = _mm_or_si128(ol_flags,
+					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
+							 ft_mask),
+					fdir_id_flags));
+				ftag = _mm_add_epi32(ftag, flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 0);
+				elts[pos + 1]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 1);
+				elts[pos + 2]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 2);
+				elts[pos + 3]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 3);
+			}
+		}
+		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					_mm_extract_epi8(mcqe1, 0);
+				const uint8_t pkt_hdr1 =
+					_mm_extract_epi8(mcqe1, 8);
+				const uint8_t pkt_hdr2 =
+					_mm_extract_epi8(mcqe2, 0);
+				const uint8_t pkt_hdr3 =
+					_mm_extract_epi8(mcqe2, 8);
+				const __m128i vlan_mask =
+					_mm_set1_epi32(PKT_RX_VLAN |
+						       PKT_RX_VLAN_STRIPPED);
+				const __m128i cv_mask =
+					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
+				const __m128i pkt_cv =
+					_mm_set_epi32(pkt_hdr0 & 0x1,
+						      pkt_hdr1 & 0x1,
+						      pkt_hdr2 & 0x1,
+						      pkt_hdr3 & 0x1);
+
+				ol_flags_mask = _mm_or_si128(ol_flags_mask,
+							     vlan_mask);
+				ol_flags = _mm_or_si128(ol_flags,
+					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
+					cv_mask), vlan_mask));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const __m128i hash_flags =
+				_mm_set1_epi32(PKT_RX_RSS_HASH);
+			const __m128i rearm_flags =
+				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);
+
+			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
+			ol_flags = _mm_or_si128(ol_flags,
+				_mm_andnot_si128(ol_flags_mask, rearm_flags));
+			elts[pos]->ol_flags =
+				_mm_extract_epi32(ol_flags, 0);
+			elts[pos + 1]->ol_flags =
+				_mm_extract_epi32(ol_flags, 1);
+			elts[pos + 2]->ol_flags =
+				_mm_extract_epi32(ol_flags, 2);
+			elts[pos + 3]->ol_flags =
+				_mm_extract_epi32(ol_flags, 3);
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
@@ -251,12 +385,9 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
 	__m128i cv_flags;
 	const __m128i zero = _mm_setzero_si128();
-	const __m128i ptype_mask =
-		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
-	const __m128i ptype_ol_mask =
-		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
-	const __m128i pinfo_mask =
-		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
+	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
+	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
+	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
 	const __m128i cv_flag_sel =
 		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
 			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
@@ -268,13 +399,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
 			     0);
 	const __m128i cv_mask =
-		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+		_mm_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
 	const __m128i mbuf_init =
 		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/mlx5: support Flow Tag and Packet Header miniCQEs
  2020-11-01 16:27 ` [dpdk-dev] [PATCH v2] " Alexander Kozyrev
@ 2020-11-01 16:38   ` Slava Ovsiienko
  0 siblings, 0 replies; 5+ messages in thread
From: Slava Ovsiienko @ 2020-11-01 16:38 UTC (permalink / raw)
  To: Alexander Kozyrev, dev; +Cc: Raslan Darawsheh, Matan Azrad

> -----Original Message-----
> From: Alexander Kozyrev <akozyrev@nvidia.com>
> Sent: Sunday, November 1, 2020 18:28
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Matan Azrad <matan@nvidia.com>
> Subject: [PATCH v2] net/mlx5: support Flow Tag and Packet Header miniCQEs
> 
> CQE compression allows us to save the PCI bandwidth and improve the
> performance by compressing several CQEs together to a miniCQE.
> But the miniCQE size is only 8 bytes and this limits the ability to successfully
> keep the compression session in case of various traffic patterns.
> 
> The current miniCQE format only keeps the compression session alive in case
> of uniform traffic with the Hash RSS as the only difference.
> There are requests to keep the compression session in case of tagged traffic by
> RTE Flow Mark Id and mixed UDP/TCP and IPv4/IPv6 traffic.
> Add 2 new miniCQE formats in order to achieve the best performance for
> these traffic patterns: Flow Tag and Packet Header miniCQEs.
> 
> The existing rxq_cqe_comp_en devarg is modified to specify the desired
> miniCQE format. Specifying 2 selects Flow Tag format for better compression
> rate in case of RTE Flow Mark traffic.
> Specifying 3 selects Checksum format (existing format for MPRQ).
> Specifying 4 selects L3/L4 Header format for better compression rate in case of
> mixed TCP/UDP and IPv4/IPv6 traffic.
> 
> Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/mlx5: support Flow Tag and Packet Header miniCQEs
  2020-11-01 16:14 Alexander Kozyrev
@ 2020-11-01 23:56 ` Raslan Darawsheh
  0 siblings, 0 replies; 5+ messages in thread
From: Raslan Darawsheh @ 2020-11-01 23:56 UTC (permalink / raw)
  To: Alexander Kozyrev, dev; +Cc: Slava Ovsiienko, Matan Azrad

Hi,

> -----Original Message-----
> From: Alexander Kozyrev <akozyrev@nvidia.com>
> Sent: Sunday, November 1, 2020 6:15 PM
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Matan Azrad <matan@nvidia.com>
> Subject: [PATCH v2] net/mlx5: support Flow Tag and Packet Header miniCQEs
> 
> CQE compression allows us to save the PCI bandwidth and improve
> the performance by compressing several CQEs together to a miniCQE.
> But the miniCQE size is only 8 bytes and this limits the ability
> to successfully keep the compression session in case of various
> traffic patterns.
> 
> The current miniCQE format only keeps the compression session alive
> in case of uniform traffic with the Hash RSS as the only difference.
> There are requests to keep the compression session in case of tagged
> traffic by RTE Flow Mark Id and mixed UDP/TCP and IPv4/IPv6 traffic.
> Add 2 new miniCQE formats in order to achieve the best performance
> for these traffic patterns: Flow Tag and Packet Header miniCQEs.
> 
> The existing rxq_cqe_comp_en devarg is modified to specify the
> desired miniCQE format. Specifying 2 selects Flow Tag format
> for better compression rate in case of RTE Flow Mark traffic.
> Specifying 3 selects Checksum format (existing format for MPRQ).
> Specifying 4 selects L3/L4 Header format for better compression
> rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.
> 
> Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
> ---
> v1:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatc
> hwork.dpdk.org%2Fpatch%2F82500%2F&amp;data=04%7C01%7Crasland%40
> nvidia.com%7C396c0aafa96b4d3bd57808d87e813f93%7C43083d15727340c1b7
> db39efd9ccc17a%7C0%7C0%7C637398440875666226%7CUnknown%7CTWFpb
> GZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI
> 6Mn0%3D%7C1000&amp;sdata=DuD7YpUNTqLAqTrcRVaGSm%2F4F6gtWK1I
> q8Zh6lcvwtA%3D&amp;reserved=0
>     added ARM and PowerPC support, reworked x86 code
> ---
>  doc/guides/nics/mlx5.rst                 |   8 +
>  doc/guides/rel_notes/release_20_11.rst   |   2 +
>  drivers/common/mlx5/mlx5_devx_cmds.c     |   7 +-
>  drivers/common/mlx5/mlx5_devx_cmds.h     |   1 +
>  drivers/common/mlx5/mlx5_prm.h           |  27 ++-
>  drivers/net/mlx5/mlx5.c                  |   7 +
>  drivers/net/mlx5/mlx5.h                  |   1 +
>  drivers/net/mlx5/mlx5_devx.c             |  44 +++--
>  drivers/net/mlx5/mlx5_rxtx.c             | 151 +++++++++------
>  drivers/net/mlx5/mlx5_rxtx.h             |   2 +
>  drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 223
> ++++++++++++++++++++++-
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 141 +++++++++++++-
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 167 ++++++++++++++---
>  13 files changed, 675 insertions(+), 106 deletions(-)
> 

Patch applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH v2] net/mlx5: support Flow Tag and Packet Header miniCQEs
@ 2020-11-01 16:14 Alexander Kozyrev
  2020-11-01 23:56 ` Raslan Darawsheh
  0 siblings, 1 reply; 5+ messages in thread
From: Alexander Kozyrev @ 2020-11-01 16:14 UTC (permalink / raw)
  To: dev; +Cc: rasland, viacheslavo, matan

CQE compression allows us to save the PCI bandwidth and improve
the performance by compressing several CQEs together to a miniCQE.
But the miniCQE size is only 8 bytes and this limits the ability
to successfully keep the compression session in case of various
traffic patterns.

The current miniCQE format only keeps the compression session alive
in case of uniform traffic with the Hash RSS as the only difference.
There are requests to keep the compression session in case of tagged
traffic by RTE Flow Mark Id and mixed UDP/TCP and IPv4/IPv6 traffic.
Add 2 new miniCQE formats in order to achieve the best performance
for these traffic patterns: Flow Tag and Packet Header miniCQEs.

The existing rxq_cqe_comp_en devarg is modified to specify the
desired miniCQE format. Specifying 2 selects Flow Tag format
for better compression rate in case of RTE Flow Mark traffic.
Specifying 3 selects Checksum format (existing format for MPRQ).
Specifying 4 selects L3/L4 Header format for better compression
rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
v1: https://patchwork.dpdk.org/patch/82500/
    added ARM and PowerPC support, reworked x86 code
---
 doc/guides/nics/mlx5.rst                 |   8 +
 doc/guides/rel_notes/release_20_11.rst   |   2 +
 drivers/common/mlx5/mlx5_devx_cmds.c     |   7 +-
 drivers/common/mlx5/mlx5_devx_cmds.h     |   1 +
 drivers/common/mlx5/mlx5_prm.h           |  27 ++-
 drivers/net/mlx5/mlx5.c                  |   7 +
 drivers/net/mlx5/mlx5.h                  |   1 +
 drivers/net/mlx5/mlx5_devx.c             |  44 +++--
 drivers/net/mlx5/mlx5_rxtx.c             | 151 +++++++++------
 drivers/net/mlx5/mlx5_rxtx.h             |   2 +
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 223 ++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 141 +++++++++++++-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 167 ++++++++++++++---
 13 files changed, 675 insertions(+), 106 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 8c59cc6a60..5d3d0c4440 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -432,6 +432,14 @@ Driver options
 
   A nonzero value enables the compression of CQE on RX side. This feature
   allows to save PCI bandwidth and improve performance. Enabled by default.
+  Different compression formats are supported in order to achieve the best
+  performance for different traffic patterns. Hash RSS format is the default.
+
+  Specifying 2 as a ``rxq_cqe_comp_en`` value selects Flow Tag format for
+  better compression rate in case of RTE Flow Mark traffic.
+  Specifying 3 as a ``rxq_cqe_comp_en`` value selects Checksum format.
+  Specifying 4 as a ``rxq_cqe_comp_en`` value selects L3/L4 Header format for
+  better compression rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.
 
   Supported on:
 
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index eb5908fd37..36d8ff6219 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -231,6 +231,8 @@ New Features
   * Added support for the new vlan fields ``has_vlan`` in the eth item and
     ``has_more_vlan`` in the vlan item.
   * Added support for PMD level multiple-thread flow insertion.
+  * Added vectorized Multi-Packet Rx Queue burst.
+  * Added support for 2 new miniCQE formats: Flow Tag and L3/L4 header.
 
   Updated Mellanox mlx5 vDPA driver:
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index a13febebfd..e1ac62a352 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1565,8 +1565,11 @@ mlx5_devx_cmd_create_cq(void *ctx, struct mlx5_devx_cq_attr *attr)
 		 MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET(cqc, cqctx, c_eqn, attr->eqn);
 	MLX5_SET(cqc, cqctx, uar_page, attr->uar_page_id);
-	MLX5_SET(cqc, cqctx, cqe_comp_en, attr->cqe_comp_en);
-	MLX5_SET(cqc, cqctx, mini_cqe_res_format, attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, cqe_comp_en, !!attr->cqe_comp_en);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format,
+		 attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format_ext,
+		 attr->mini_cqe_res_format_ext);
 	MLX5_SET(cqc, cqctx, cqe_sz, attr->cqe_size);
 	if (attr->q_umem_valid) {
 		MLX5_SET(create_cq_in, in, cq_umem_valid, attr->q_umem_valid);
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index 472d1cb658..7feaafacf1 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -256,6 +256,7 @@ struct mlx5_devx_cq_attr {
 	uint32_t overrun_ignore:1;
 	uint32_t cqe_comp_en:1;
 	uint32_t mini_cqe_res_format:2;
+	uint32_t mini_cqe_res_format_ext:2;
 	uint32_t cqe_size:3;
 	uint32_t log_cq_size:5;
 	uint32_t log_page_size:5;
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 7d671a3996..d8437d13f6 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -239,6 +239,9 @@
 /* Default mark mask for metadata legacy mode. */
 #define MLX5_FLOW_MARK_MASK 0xffffff
 
+/* Byte length mask when mark is enabled in a miniCQE. */
+#define MLX5_LEN_WITH_MARK_MASK 0xffffff00
+
 /* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
@@ -2152,11 +2155,14 @@ struct mlx5_ifc_cqc_bits {
 	u8 cqe_comp_en[0x1];
 	u8 mini_cqe_res_format[0x2];
 	u8 st[0x4];
-	u8 reserved_at_18[0x8];
+	u8 reserved_at_18[0x1];
+	u8 cqe_comp_layout[0x7];
 	u8 dbr_umem_id[0x20];
 	u8 reserved_at_40[0x14];
 	u8 page_offset[0x6];
-	u8 reserved_at_5a[0x6];
+	u8 reserved_at_5a[0x2];
+	u8 mini_cqe_res_format_ext[0x2];
+	u8 cq_timestamp_format[0x2];
 	u8 reserved_at_60[0x3];
 	u8 log_cq_size[0x5];
 	u8 uar_page[0x18];
@@ -2918,7 +2924,14 @@ struct mlx5_mini_cqe8 {
 	union {
 		uint32_t rx_hash_result;
 		struct {
-			uint16_t checksum;
+			union {
+				uint16_t checksum;
+				uint16_t flow_tag_high;
+				struct {
+					uint8_t reserved;
+					uint8_t hdr_type;
+				};
+			};
 			uint16_t stride_idx;
 		};
 		struct {
@@ -2927,15 +2940,19 @@ struct mlx5_mini_cqe8 {
 			uint8_t  reserved;
 		} s_wqe_info;
 	};
-	uint32_t byte_cnt;
+	union {
+		uint32_t byte_cnt_flow;
+		uint32_t byte_cnt;
+	};
 };
 
 /* Mini CQE responder format. */
 enum {
 	MLX5_CQE_RESP_FORMAT_HASH = 0x0,
 	MLX5_CQE_RESP_FORMAT_CSUM = 0x1,
-	MLX5_CQE_RESP_FORMAT_CSUM_FLOW_TAG = 0x2,
+	MLX5_CQE_RESP_FORMAT_FTAG_STRIDX = 0x2,
 	MLX5_CQE_RESP_FORMAT_CSUM_STRIDX = 0x3,
+	MLX5_CQE_RESP_FORMAT_L34H_STRIDX = 0x4,
 };
 
 /* srTCM PRM flow meter parameters. */
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 862bd401d9..67a530d85a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1361,7 +1361,14 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	}
 	mod = tmp >= 0 ? tmp : -tmp;
 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
+		if (tmp > MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+			DRV_LOG(ERR, "invalid CQE compression "
+				     "format parameter");
+			rte_errno = EINVAL;
+			return -rte_errno;
+		}
 		config->cqe_comp = !!tmp;
+		config->cqe_comp_fmt = tmp;
 	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
 		config->cqe_pad = !!tmp;
 	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index b080426b72..265e8f09ee 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -205,6 +205,7 @@ struct mlx5_dev_config {
 	/* Whether tunnel stateless offloads are supported. */
 	unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
+	unsigned int cqe_comp_fmt:3; /* CQE compression format. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 5fce4cd555..b3acbc24fc 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -437,17 +437,39 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)
 	if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
 	    !rxq_data->lro) {
 		cq_attr.cqe_comp_en = 1u;
-		/*
-		 * Select CSUM miniCQE format only for non-vectorized MPRQ
-		 * Rx burst, use HASH miniCQE format for everything else.
-		 */
-		if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
-			mlx5_rxq_mprq_enabled(rxq_data))
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
-		else
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_HASH;
+		rxq_data->mcqe_format = priv->config.cqe_comp_fmt;
+		rxq_data->byte_mask = UINT32_MAX;
+		switch (priv->config.cqe_comp_fmt) {
+		case MLX5_CQE_RESP_FORMAT_HASH:
+			/* fallthrough */
+		case MLX5_CQE_RESP_FORMAT_CSUM:
+			/*
+			 * Select CSUM miniCQE format only for non-vectorized
+			 * MPRQ Rx burst, use HASH miniCQE format for others.
+			 */
+			if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
+			    mlx5_rxq_mprq_enabled(rxq_data))
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+			else
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_HASH;
+			rxq_data->mcqe_format = cq_attr.mini_cqe_res_format;
+			break;
+		case MLX5_CQE_RESP_FORMAT_FTAG_STRIDX:
+			rxq_data->byte_mask = MLX5_LEN_WITH_MARK_MASK;
+			/* fallthrough */
+		case MLX5_CQE_RESP_FORMAT_CSUM_STRIDX:
+			cq_attr.mini_cqe_res_format = priv->config.cqe_comp_fmt;
+			break;
+		case MLX5_CQE_RESP_FORMAT_L34H_STRIDX:
+			cq_attr.mini_cqe_res_format = 0;
+			cq_attr.mini_cqe_res_format_ext = 1;
+			break;
+		}
+		DRV_LOG(DEBUG,
+			"Port %u Rx CQE compression is enabled, format %d.",
+			dev->data->port_id, priv->config.cqe_comp_fmt);
 		/*
 		 * For vectorized Rx, it must not be doubled in order to
 		 * make cq_ci and rq_ci aligned.
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2ffacf8882..6d9add8bb8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -80,7 +80,8 @@ static uint16_t mlx5_tx_burst_##func(void *txq, \
 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
 
 static __rte_always_inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe);
 
 static __rte_always_inline int
 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
@@ -91,7 +92,8 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
 
 static __rte_always_inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe);
 
 static int
 mlx5_queue_state_modify(struct rte_eth_dev *dev,
@@ -100,12 +102,13 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev,
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum);
+			uint32_t phcsum, uint8_t l4_type);
 
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len);
+			volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len);
 
 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
@@ -813,12 +816,19 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
  *   Packet type for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe)
 {
 	uint8_t idx;
-	uint8_t pinfo = cqe->pkt_info;
-	uint16_t ptype = cqe->hdr_type_etc;
+	uint8_t ptype;
+	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
 
+	/* Get the L3/L4 header type from the mini-CQE in case of L3/L4 format. */
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
+	else
+		ptype = mcqe->hdr_type >> 2;
 	/*
 	 * The index to the array should have:
 	 * bit[1:0] = l3_hdr_type
@@ -827,7 +837,7 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
 	 * bit[6] = tunneled
 	 * bit[7] = outer_l3_type
 	 */
-	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+	idx = pinfo | ptype;
 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
 }
 
@@ -1131,8 +1141,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				(volatile struct mlx5_mini_cqe8 (*)[8])
 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
 							  cqe_cnt].pkt_info);
-
-			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
+					       rxq->byte_mask);
 			*mcqe = &(*mc)[zip->ai & 7];
 			if ((++zip->ai & 7) == 0) {
 				/* Invalidate consumed CQEs */
@@ -1210,7 +1220,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				--rxq->cq_ci;
 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
 				/* Get packet size to return. */
-				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+				len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
+						       rxq->byte_mask);
 				*mcqe = &(*mc)[0];
 				zip->ai = 1;
 				/* Prefetch all to be invalidated */
@@ -1274,22 +1285,42 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
  */
 static inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe)
 {
 	/* Update packet information. */
-	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
-	if (rss_hash_res && rxq->rss_hash) {
-		pkt->hash.rss = rss_hash_res;
-		pkt->ol_flags |= PKT_RX_RSS_HASH;
+	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
+
+	if (rxq->rss_hash) {
+		uint32_t rss_hash_res = 0;
+
+		/* If compressed, take hash result from mini-CQE. */
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
+			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+		else
+			rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
+		if (rss_hash_res) {
+			pkt->hash.rss = rss_hash_res;
+			pkt->ol_flags |= PKT_RX_RSS_HASH;
+		}
 	}
-	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
-		pkt->ol_flags |= PKT_RX_FDIR;
-		if (cqe->sop_drop_qpn !=
-		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
-			uint32_t mark = cqe->sop_drop_qpn;
-
-			pkt->ol_flags |= PKT_RX_FDIR_ID;
-			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+	if (rxq->mark) {
+		uint32_t mark = 0;
+
+		/* If compressed, take flow tag from mini-CQE. */
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+			mark = cqe->sop_drop_qpn;
+		else
+			mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
+				(mcqe->flow_tag_high << 16);
+		if (MLX5_FLOW_MARK_IS_VALID(mark)) {
+			pkt->ol_flags |= PKT_RX_FDIR;
+			if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) {
+				pkt->ol_flags |= PKT_RX_FDIR_ID;
+				pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+			}
 		}
 	}
 	if (rxq->dynf_meta && cqe->flow_table_metadata) {
@@ -1299,10 +1330,20 @@ rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
 	}
 	if (rxq->csum)
 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
-	if (rxq->vlan_strip &&
-	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
-		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
-		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+	if (rxq->vlan_strip) {
+		bool vlan_strip;
+
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+			vlan_strip = cqe->hdr_type_etc &
+				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
+		else
+			vlan_strip = mcqe->hdr_type &
+				     RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
+		if (vlan_strip) {
+			pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+			pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+		}
 	}
 	if (rxq->hw_timestamp) {
 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
@@ -1348,7 +1389,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res;
 
 		if (pkt)
 			NEXT(seg) = rep;
@@ -1387,18 +1427,14 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			pkt = seg;
 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
 			pkt->ol_flags &= EXT_ATTACHED_MBUF;
-			/* If compressed, take hash result from mini-CQE. */
-			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
-							cqe->rx_hash_res :
-							mcqe->rx_hash_result);
-			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 			if (rxq->crc_present)
 				len -= RTE_ETHER_CRC_LEN;
 			PKT_LEN(pkt) = len;
 			if (cqe->lro_num_seg > 1) {
 				mlx5_lro_update_hdr
 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
-					 len);
+					 mcqe, rxq, len);
 				pkt->ol_flags |= PKT_RX_LRO;
 				pkt->tso_segsz = len / cqe->lro_num_seg;
 			}
@@ -1468,10 +1504,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum)
+			uint32_t phcsum, uint8_t l4_type)
 {
-	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
-			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
 	/*
 	 * The HW calculates only the TCP payload checksum, need to complete
 	 * the TCP header checksum and the L3 pseudo-header checksum.
@@ -1510,7 +1544,8 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len)
+		    volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len)
 {
 	union {
 		struct rte_ether_hdr *eth;
@@ -1524,6 +1559,7 @@ mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 	};
 	uint16_t proto = h.eth->ether_type;
 	uint32_t phcsum;
+	uint8_t l4_type;
 
 	h.eth++;
 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
@@ -1545,7 +1581,14 @@ mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
 		h.ipv6++;
 	}
-	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+		l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	else
+		l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
 }
 
 void
@@ -1586,6 +1629,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct mlx5_rxq_data *rxq = dpdk_rxq;
 	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t strd_sz = 1 << rxq->strd_sz_n;
 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
@@ -1602,7 +1646,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint16_t strd_idx;
 		uint32_t byte_cnt;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res = 0;
 		enum mlx5_rqx_code rxq_code;
 
 		if (consumed_strd == strd_n) {
@@ -1618,19 +1661,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (!ret)
 			break;
 		byte_cnt = ret;
-		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
-			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
+		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
+		if (rxq->crc_present)
+			len -= RTE_ETHER_CRC_LEN;
+		if (mcqe &&
+		    rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+			strd_cnt = (len / strd_sz) + !!(len % strd_sz);
+		else
+			strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+				   MLX5_MPRQ_STRIDE_NUM_SHIFT;
 		MLX5_ASSERT(strd_cnt);
 		consumed_strd += strd_cnt;
 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
 			continue;
-		if (mcqe == NULL) {
-			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
-			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
-		} else {
-			/* mini-CQE for MPRQ doesn't have hash result. */
-			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
-		}
+		strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
+					cqe->wqe_counter :
+					mcqe->stride_idx);
 		MLX5_ASSERT(strd_idx < strd_n);
 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
 			    wq_mask));
@@ -1656,10 +1703,10 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				break;
 			}
 		}
-		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+		rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 		if (cqe->lro_num_seg > 1) {
 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
-					    cqe, len);
+					    cqe, mcqe, rxq, len);
 			pkt->ol_flags |= PKT_RX_LRO;
 			pkt->tso_segsz = len / cqe->lro_num_seg;
 		}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3ae5e01d37..071483271c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -126,6 +126,7 @@ struct mlx5_rxq_data {
 	unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */
 	unsigned int lro:1; /* Enable LRO. */
 	unsigned int dynf_meta:1; /* Dynamic metadata is configured. */
+	unsigned int mcqe_format:3; /* CQE compression miniCQE format. */
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t port_id;
@@ -135,6 +136,7 @@ struct mlx5_rxq_data {
 	uint32_t rq_pi;
 	uint32_t cq_ci;
 	uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
+	uint32_t byte_mask;
 	union {
 		struct rxq_zip zip; /* Compressed context. */
 		uint16_t decompressed;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index cf3a795843..8fff7f729c 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -108,7 +108,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	const vector unsigned short rxdf_sel_mask =
 		(vector unsigned short){
 			0xffff, 0xffff, 0, 0, 0, 0xffff, 0, 0};
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	vector unsigned char ol_flags = (vector unsigned char){0};
+	vector unsigned char ol_flags_mask = (vector unsigned char){0};
 	unsigned int pos;
 	unsigned int i;
 	unsigned int inv = 0;
@@ -231,11 +232,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			vec_sel((vector unsigned long)shmask,
 			(vector unsigned long)invalid_mask, shmask);
 
-		mcqe1 = (vector unsigned char)
+		byte_cnt = (vector unsigned char)
+			vec_sel((vector unsigned short)
 			vec_sro((vector unsigned short)mcqe1,
 			(vector unsigned char){32}),
-		byte_cnt = (vector unsigned char)
-			vec_sel((vector unsigned short)mcqe1,
 			(vector unsigned short)mcqe2, mcqe_sel_mask);
 		byte_cnt = vec_perm(byte_cnt, zero, len_shuf_mask);
 		byte_cnt = (vector unsigned char)
@@ -255,11 +255,216 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 #endif
 
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+			    MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			} else {
+				const vector unsigned char flow_mark_adj =
+					(vector unsigned char)
+					(vector unsigned int){
+					-1, -1, -1, -1};
+				const vector unsigned char flow_mark_shuf =
+					(vector unsigned char){
+					-1, -1, -1, -1,
+					-1, -1, -1, -1,
+					12,  8,  9, -1,
+					 4,  0,  1,  -1};
+				const vector unsigned char ft_mask =
+					(vector unsigned char)
+					(vector unsigned int){
+					0xffffff00, 0xffffff00,
+					0xffffff00, 0xffffff00};
+				const vector unsigned char fdir_flags =
+					(vector unsigned char)
+					(vector unsigned int){
+					PKT_RX_FDIR, PKT_RX_FDIR,
+					PKT_RX_FDIR, PKT_RX_FDIR};
+				const vector unsigned char fdir_all_flags =
+					(vector unsigned char)
+					(vector unsigned int){
+					PKT_RX_FDIR | PKT_RX_FDIR_ID,
+					PKT_RX_FDIR | PKT_RX_FDIR_ID,
+					PKT_RX_FDIR | PKT_RX_FDIR_ID,
+					PKT_RX_FDIR | PKT_RX_FDIR_ID};
+				vector unsigned char fdir_id_flags =
+					(vector unsigned char)
+					(vector unsigned int){
+					PKT_RX_FDIR_ID, PKT_RX_FDIR_ID,
+					PKT_RX_FDIR_ID, PKT_RX_FDIR_ID};
+				/* Extract flow_tag field. */
+				vector unsigned char ftag0 = vec_perm(mcqe1,
+							zero, flow_mark_shuf);
+				vector unsigned char ftag1 = vec_perm(mcqe2,
+							zero, flow_mark_shuf);
+				vector unsigned char ftag =
+					(vector unsigned char)
+					vec_mergel((vector unsigned int)ftag0,
+					(vector unsigned int)ftag1);
+				vector unsigned char invalid_mask =
+					(vector unsigned char)
+					vec_cmpeq((vector unsigned int)ftag,
+					(vector unsigned int)zero);
+
+				ol_flags_mask = (vector unsigned char)
+					vec_or((vector unsigned long)
+					ol_flags_mask,
+					(vector unsigned long)fdir_all_flags);
+
+				/* Set PKT_RX_FDIR if flow tag is non-zero. */
+				invalid_mask = (vector unsigned char)
+					vec_cmpeq((vector unsigned int)ftag,
+					(vector unsigned int)zero);
+				ol_flags = (vector unsigned char)
+					vec_or((vector unsigned long)ol_flags,
+					(vector unsigned long)
+					vec_andc((vector unsigned long)
+					fdir_flags,
+					(vector unsigned long)invalid_mask));
+				ol_flags_mask = (vector unsigned char)
+					vec_or((vector unsigned long)
+					ol_flags_mask,
+					(vector unsigned long)fdir_flags);
+
+				/* Mask out invalid entries. */
+				fdir_id_flags = (vector unsigned char)
+					vec_andc((vector unsigned long)
+					fdir_id_flags,
+					(vector unsigned long)invalid_mask);
+
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = (vector unsigned char)
+					vec_or((vector unsigned long)ol_flags,
+					(vector unsigned long)
+					vec_andc((vector unsigned long)
+					fdir_id_flags,
+					(vector unsigned long)
+					vec_cmpeq((vector unsigned int)ftag,
+					(vector unsigned int)ft_mask)));
+
+				ftag = (vector unsigned char)
+					((vector unsigned int)ftag +
+					(vector unsigned int)flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+					((vector unsigned int)ftag)[0];
+				elts[pos + 1]->hash.fdir.hi =
+					((vector unsigned int)ftag)[1];
+				elts[pos + 2]->hash.fdir.hi =
+					((vector unsigned int)ftag)[2];
+				elts[pos + 3]->hash.fdir.hi =
+					((vector unsigned int)ftag)[3];
+			}
+		}
+		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					mcq[pos % 8].hdr_type;
+				const uint8_t pkt_hdr1 =
+					mcq[pos % 8 + 1].hdr_type;
+				const uint8_t pkt_hdr2 =
+					mcq[pos % 8 + 2].hdr_type;
+				const uint8_t pkt_hdr3 =
+					mcq[pos % 8 + 3].hdr_type;
+				const vector unsigned char vlan_mask =
+					(vector unsigned char)
+					(vector unsigned int) {
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
+					(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED)};
+				const vector unsigned char cv_mask =
+					(vector unsigned char)
+					(vector unsigned int) {
+					MLX5_CQE_VLAN_STRIPPED,
+					MLX5_CQE_VLAN_STRIPPED,
+					MLX5_CQE_VLAN_STRIPPED,
+					MLX5_CQE_VLAN_STRIPPED};
+				vector unsigned char pkt_cv =
+					(vector unsigned char)
+					(vector unsigned int) {
+					pkt_hdr0 & 0x1, pkt_hdr1 & 0x1,
+					pkt_hdr2 & 0x1, pkt_hdr3 & 0x1};
+
+				ol_flags_mask = (vector unsigned char)
+					vec_or((vector unsigned long)
+					ol_flags_mask,
+					(vector unsigned long)vlan_mask);
+				ol_flags = (vector unsigned char)
+					vec_or((vector unsigned long)ol_flags,
+					(vector unsigned long)
+					vec_and((vector unsigned long)vlan_mask,
+					(vector unsigned long)
+					vec_cmpeq((vector unsigned int)pkt_cv,
+					(vector unsigned int)cv_mask)));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const vector unsigned char hash_mask =
+				(vector unsigned char)(vector unsigned int) {
+					PKT_RX_RSS_HASH,
+					PKT_RX_RSS_HASH,
+					PKT_RX_RSS_HASH,
+					PKT_RX_RSS_HASH};
+			const vector unsigned char rearm_flags =
+				(vector unsigned char)(vector unsigned int) {
+				(uint32_t)t_pkt->ol_flags,
+				(uint32_t)t_pkt->ol_flags,
+				(uint32_t)t_pkt->ol_flags,
+				(uint32_t)t_pkt->ol_flags};
+
+			ol_flags_mask = (vector unsigned char)
+				vec_or((vector unsigned long)ol_flags_mask,
+				(vector unsigned long)hash_mask);
+			ol_flags = (vector unsigned char)
+				vec_or((vector unsigned long)ol_flags,
+				(vector unsigned long)
+				vec_andc((vector unsigned long)rearm_flags,
+				(vector unsigned long)ol_flags_mask));
+
+			elts[pos]->ol_flags =
+				((vector unsigned int)ol_flags)[0];
+			elts[pos + 1]->ol_flags =
+				((vector unsigned int)ol_flags)[1];
+			elts[pos + 2]->ol_flags =
+				((vector unsigned int)ol_flags)[2];
+			elts[pos + 3]->ol_flags =
+				((vector unsigned int)ol_flags)[3];
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 47b6692942..d5fe00857c 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -111,7 +111,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
 		0, 0
 	};
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	uint32x4_t ol_flags = {0, 0, 0, 0};
+	uint32x4_t ol_flags_mask = {0, 0, 0, 0};
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	uint32_t rcvd_byte = 0;
 #endif
@@ -198,11 +199,139 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
 #endif
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+			    MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			}  else {
+				const uint32x4_t flow_mark_adj = {
+					-1, -1, -1, -1 };
+				const uint8x16_t flow_mark_shuf = {
+					28, 24, 25, -1,
+					20, 16, 17, -1,
+					12,  8,  9, -1,
+					 4,  0,  1, -1};
+				/* Extract flow_tag field. */
+				const uint32x4_t ft_mask =
+					vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
+				const uint32x4_t fdir_flags =
+					vdupq_n_u32(PKT_RX_FDIR);
+				const uint32x4_t fdir_all_flags =
+					vdupq_n_u32(PKT_RX_FDIR |
+						    PKT_RX_FDIR_ID);
+				uint32x4_t fdir_id_flags =
+					vdupq_n_u32(PKT_RX_FDIR_ID);
+				uint32x4_t invalid_mask, ftag;
+
+				__asm__ volatile
+				/* A.1 load mCQEs into a 128bit register. */
+				("ld1 {v16.16b - v17.16b}, [%[mcq]]\n\t"
+				/* Extract flow_tag. */
+				 "tbl %[ftag].16b, {v16.16b - v17.16b}, %[flow_mark_shuf].16b\n\t"
+				: [ftag]"=&w"(ftag)
+				: [mcq]"r"(p),
+				  [flow_mark_shuf]"w"(flow_mark_shuf)
+				: "memory", "v16", "v17");
+				invalid_mask = vceqzq_u32(ftag);
+				ol_flags_mask = vorrq_u32(ol_flags_mask,
+							  fdir_all_flags);
+				/* Set PKT_RX_FDIR if flow tag is non-zero. */
+				ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(fdir_flags, invalid_mask));
+				/* Mask out invalid entries. */
+				fdir_id_flags = vbicq_u32(fdir_id_flags,
+							  invalid_mask);
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(fdir_id_flags,
+						  vceqq_u32(ftag, ft_mask)));
+				ftag = vaddq_u32(ftag, flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 3);
+				elts[pos + 1]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 2);
+				elts[pos + 2]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 1);
+				elts[pos + 3]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 0);
+				}
+		}
+		if (unlikely(rxq->mcqe_format !=
+			     MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					mcq[pos % 8].hdr_type;
+				const uint8_t pkt_hdr1 =
+					mcq[pos % 8 + 1].hdr_type;
+				const uint8_t pkt_hdr2 =
+					mcq[pos % 8 + 2].hdr_type;
+				const uint8_t pkt_hdr3 =
+					mcq[pos % 8 + 3].hdr_type;
+				const uint32x4_t vlan_mask =
+					vdupq_n_u32(PKT_RX_VLAN |
+						    PKT_RX_VLAN_STRIPPED);
+				const uint32x4_t cv_mask =
+					vdupq_n_u32(MLX5_CQE_VLAN_STRIPPED);
+				const uint32x4_t pkt_cv = {
+					pkt_hdr0 & 0x1, pkt_hdr1 & 0x1,
+					pkt_hdr2 & 0x1, pkt_hdr3 & 0x1};
+
+				ol_flags_mask = vorrq_u32(ol_flags_mask,
+							  vlan_mask);
+				ol_flags = vorrq_u32(ol_flags,
+						vandq_u32(vlan_mask,
+						vceqq_u32(pkt_cv, cv_mask)));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const uint32x4_t hash_flags =
+				vdupq_n_u32(PKT_RX_RSS_HASH);
+			const uint32x4_t rearm_flags =
+				vdupq_n_u32((uint32_t)t_pkt->ol_flags);
+
+			ol_flags_mask = vorrq_u32(ol_flags_mask, hash_flags);
+			ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(rearm_flags, ol_flags_mask));
+			elts[pos]->ol_flags = vgetq_lane_u32(ol_flags, 3);
+			elts[pos + 1]->ol_flags = vgetq_lane_u32(ol_flags, 2);
+			elts[pos + 2]->ol_flags = vgetq_lane_u32(ol_flags, 1);
+			elts[pos + 3]->ol_flags = vgetq_lane_u32(ol_flags, 0);
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 59662fa12d..732e5859a4 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -104,7 +104,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			      0,
 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
 			      0, 0);
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	__m128i ol_flags = _mm_setzero_si128();
+	__m128i ol_flags_mask = _mm_setzero_si128();
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	const __m128i zero = _mm_setzero_si128();
 	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
@@ -175,19 +176,152 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 					      (mcqe_n - pos) *
 					      sizeof(uint16_t) * 8);
 		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
-		mcqe1 = _mm_srli_si128(mcqe1, 4);
-		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
+		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
+					   mcqe2, 0xcc);
 		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
 		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
 		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
 		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
 #endif
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+				MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			} else {
+				const __m128i flow_mark_adj =
+					_mm_set_epi32(-1, -1, -1, -1);
+				const __m128i flow_mark_shuf =
+					_mm_set_epi8(-1,  1,  0,  4,
+						     -1,  9,  8, 12,
+						     -1, -1, -1, -1,
+						     -1, -1, -1, -1);
+				const __m128i ft_mask =
+					_mm_set1_epi32(0xffffff00);
+				const __m128i fdir_flags =
+					_mm_set1_epi32(PKT_RX_FDIR);
+				const __m128i fdir_all_flags =
+					_mm_set1_epi32(PKT_RX_FDIR |
+						       PKT_RX_FDIR_ID);
+				__m128i fdir_id_flags =
+					_mm_set1_epi32(PKT_RX_FDIR_ID);
+
+				/* Extract flow_tag field. */
+				__m128i ftag0 =
+					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
+				__m128i ftag1 =
+					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
+				__m128i ftag =
+					_mm_unpackhi_epi64(ftag0, ftag1);
+				__m128i invalid_mask =
+					_mm_cmpeq_epi32(ftag, zero);
+
+				ol_flags_mask = _mm_or_si128(ol_flags_mask,
+							     fdir_all_flags);
+				/* Set PKT_RX_FDIR if flow tag is non-zero. */
+				ol_flags = _mm_or_si128(ol_flags,
+					_mm_andnot_si128(invalid_mask,
+							 fdir_flags));
+				/* Mask out invalid entries. */
+				fdir_id_flags = _mm_andnot_si128(invalid_mask,
+								 fdir_id_flags);
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = _mm_or_si128(ol_flags,
+					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
+							 ft_mask),
+					fdir_id_flags));
+				ftag = _mm_add_epi32(ftag, flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 0);
+				elts[pos + 1]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 1);
+				elts[pos + 2]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 2);
+				elts[pos + 3]->hash.fdir.hi =
+						_mm_extract_epi32(ftag, 3);
+			}
+		}
+		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					_mm_extract_epi8(mcqe1, 0);
+				const uint8_t pkt_hdr1 =
+					_mm_extract_epi8(mcqe1, 8);
+				const uint8_t pkt_hdr2 =
+					_mm_extract_epi8(mcqe2, 0);
+				const uint8_t pkt_hdr3 =
+					_mm_extract_epi8(mcqe2, 8);
+				const __m128i vlan_mask =
+					_mm_set1_epi32(PKT_RX_VLAN |
+						       PKT_RX_VLAN_STRIPPED);
+				const __m128i cv_mask =
+					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
+				const __m128i pkt_cv =
+					_mm_set_epi32(pkt_hdr0 & 0x1,
+						      pkt_hdr1 & 0x1,
+						      pkt_hdr2 & 0x1,
+						      pkt_hdr3 & 0x1);
+
+				ol_flags_mask = _mm_or_si128(ol_flags_mask,
+							     vlan_mask);
+				ol_flags = _mm_or_si128(ol_flags,
+					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
+					cv_mask), vlan_mask));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const __m128i hash_flags =
+				_mm_set1_epi32(PKT_RX_RSS_HASH);
+			const __m128i rearm_flags =
+				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);
+
+			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
+			ol_flags = _mm_or_si128(ol_flags,
+				_mm_andnot_si128(ol_flags_mask, rearm_flags));
+			elts[pos]->ol_flags =
+				_mm_extract_epi32(ol_flags, 0);
+			elts[pos + 1]->ol_flags =
+				_mm_extract_epi32(ol_flags, 1);
+			elts[pos + 2]->ol_flags =
+				_mm_extract_epi32(ol_flags, 2);
+			elts[pos + 3]->ol_flags =
+				_mm_extract_epi32(ol_flags, 3);
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
@@ -251,12 +385,9 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
 	__m128i cv_flags;
 	const __m128i zero = _mm_setzero_si128();
-	const __m128i ptype_mask =
-		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
-	const __m128i ptype_ol_mask =
-		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
-	const __m128i pinfo_mask =
-		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
+	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
+	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
+	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
 	const __m128i cv_flag_sel =
 		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
 			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
@@ -268,13 +399,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
 			     0);
 	const __m128i cv_mask =
-		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+		_mm_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
 	const __m128i mbuf_init =
 		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-11-01 23:56 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-28  2:37 [dpdk-dev] [PATCH] net/mlx5: support Flow Tag and Packet Header miniCQEs Alexander Kozyrev
2020-11-01 16:27 ` [dpdk-dev] [PATCH v2] " Alexander Kozyrev
2020-11-01 16:38   ` Slava Ovsiienko
2020-11-01 16:14 Alexander Kozyrev
2020-11-01 23:56 ` Raslan Darawsheh

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git