DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 1/3] net/mlx5: optimize mprq memcpy
@ 2020-09-07 19:32 Aman Kumar
  2020-09-07 19:32 ` [dpdk-dev] [PATCH 2/3] net/mlx5: add non temporal store for WQE fields Aman Kumar
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Aman Kumar @ 2020-09-07 19:32 UTC (permalink / raw)
  To: dev; +Cc: rasland, keesang.song, aman.kumar

add non-temporal load and temporal store for mprq memcpy.
Setting CONFIG_RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY=y in
the dpdk config enables this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/common_base                      |  1 +
 drivers/net/mlx5/mlx5.c                 | 12 ++++
 drivers/net/mlx5/mlx5.h                 |  3 +
 drivers/net/mlx5/mlx5_rxq.c             |  3 +
 drivers/net/mlx5/mlx5_rxtx.c            | 17 ++++-
 drivers/net/mlx5/mlx5_rxtx.h            |  3 +
 lib/librte_eal/x86/include/rte_memcpy.h | 92 +++++++++++++++++++++++++
 7 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/config/common_base b/config/common_base
index fbf0ee70c..1476cf334 100644
--- a/config/common_base
+++ b/config/common_base
@@ -371,6 +371,7 @@ CONFIG_RTE_LIBRTE_MLX4_DEBUG=n
 # ConnectX-6 & BlueField (MLX5) PMD
 #
 CONFIG_RTE_LIBRTE_MLX5_PMD=n
+CONFIG_RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY=n
 CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
 
 #
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1e4c695f8..6eb85dfac 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -163,6 +163,11 @@
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1621,6 +1626,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1681,6 +1690,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 78d6eb728..09dc90953 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -236,6 +236,9 @@ struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 79eb8f8d7..bee5c03bc 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -2302,6 +2302,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
 	mprq_stride_size = non_scatter_min_mbuf_size <=
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1b71e9422..62ade3775 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1774,8 +1774,21 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		    rxq->mprq_repl == NULL ||
 		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				if ((rxq->mprq_tstore_memcpy) &&
+				    (!(((uintptr_t)(rte_pktmbuf_mtod(pkt,
+								     void *)) |
+					(uintptr_t)addr) & ALIGNMENT_MASK))) {
+					memcpy_aligned_rx_tstore_16B(
+						rte_pktmbuf_mtod(pkt, void *),
+						addr, len);
+				} else {
+#endif
+					rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+							addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				}
+#endif
 				DATA_LEN(pkt) = len;
 			} else if (rxq->strd_scatter_en) {
 				struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index c02a007c8..72763962f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -153,6 +153,9 @@ struct mlx5_rxq_data {
 	uint32_t tunnel; /* Tunnel information. */
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_obj_type {
diff --git a/lib/librte_eal/x86/include/rte_memcpy.h b/lib/librte_eal/x86/include/rte_memcpy.h
index 9c67232df..6345572a7 100644
--- a/lib/librte_eal/x86/include/rte_memcpy.h
+++ b/lib/librte_eal/x86/include/rte_memcpy.h
@@ -874,6 +874,98 @@ rte_memcpy(void *dst, const void *src, size_t n)
 		return rte_memcpy_generic(dst, src, n);
 }
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+static __rte_always_inline
+void copy16B_ts(void *dst, void *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+static __rte_always_inline
+void copy32B_ts(void *dst, void *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static __rte_always_inline
+void copy64B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+}
+
+static __rte_always_inline
+void copy128B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 64));
+	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 96));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3);
+}
+
+static __rte_always_inline
+void memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
+{
+	while (len >= 128) {
+		copy128B_ts(dst, src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		copy64B_ts(dst, src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		copy32B_ts(dst, src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		copy16B_ts(dst, src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+}
+#endif
+
 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 #pragma GCC diagnostic pop
 #endif
-- 
2.25.1


-- 



_Disclaimer: _(c) 2020 VVDN Technologies Pvt. Ltd. This e-mail contains 
PRIVILEGED AND CONFIDENTIAL INFORMATION intended solely for the use of the 
addressee(s). If you are not the intended recipient, please notify the 
sender by e-mail and delete the original message. Further, you are not to 
copy, disclose, or distribute this e-mail or its contents to any other 
person and any such actions are unlawful._
_
_
_
__

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH 2/3] net/mlx5: add non temporal store for WQE fields
  2020-09-07 19:32 [dpdk-dev] [PATCH 1/3] net/mlx5: optimize mprq memcpy Aman Kumar
@ 2020-09-07 19:32 ` Aman Kumar
  2020-09-07 19:32 ` [dpdk-dev] [PATCH 3/3] config: added build config file for AMD EPYC platform Aman Kumar
  2020-09-25  3:16 ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2 siblings, 0 replies; 13+ messages in thread
From: Aman Kumar @ 2020-09-07 19:32 UTC (permalink / raw)
  To: dev; +Cc: rasland, keesang.song, aman.kumar

add non-temporal store for a few WQE fields to
optimize the data path. This can be enabled by setting
CONFIG_RTE_LIBRTE_MLX5_NT_STORE=y in the dpdk config.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/common_base               |  1 +
 drivers/net/mlx5/mlx5.c          | 17 +++++++++++++++++
 drivers/net/mlx5/mlx5.h          |  4 ++++
 drivers/net/mlx5/mlx5_rxq.c      |  3 +++
 drivers/net/mlx5/mlx5_rxtx.c     | 20 +++++++++++++++++---
 drivers/net/mlx5/mlx5_rxtx.h     |  6 ++++++
 drivers/net/mlx5/mlx5_rxtx_vec.h | 28 +++++++++++++++++++++++-----
 drivers/net/mlx5/mlx5_txq.c      |  3 +++
 8 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/config/common_base b/config/common_base
index 1476cf334..bbe74833b 100644
--- a/config/common_base
+++ b/config/common_base
@@ -372,6 +372,7 @@ CONFIG_RTE_LIBRTE_MLX4_DEBUG=n
 #
 CONFIG_RTE_LIBRTE_MLX5_PMD=n
 CONFIG_RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY=n
+CONFIG_RTE_LIBRTE_MLX5_NT_STORE=n
 CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
 
 #
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6eb85dfac..8e1b7df23 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -167,6 +167,13 @@
 /* mprq_tstore_memcpy */
 #define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+/* tx_wqe_field_ntstore */
+#define MLX5_TX_WQE_FIELD_NTSTORE "tx_wqe_field_ntstore"
+
+/* vec_rx_wqe_field_ntstore */
+#define MLX5_VEC_RX_WQE_FIELD_NTSTORE "vec_rx_wqe_field_ntstore"
+#endif
 
 /*
  * Device parameter to configure the total data buffer size for a single
@@ -1629,6 +1636,12 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
 		config->mprq_tstore_memcpy = tmp;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	} else if (strcmp(MLX5_TX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->tx_wqe_field_ntstore = tmp;
+	} else if (strcmp(MLX5_VEC_RX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->vec_rx_wqe_field_ntstore = tmp;
 #endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
@@ -1692,6 +1705,10 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_DECAP_EN,
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		MLX5_TX_WQE_FIELD_NTSTORE,
+		MLX5_VEC_RX_WQE_FIELD_NTSTORE,
 #endif
 		NULL,
 	};
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 09dc90953..4a816cb2e 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -239,6 +239,10 @@ struct mlx5_dev_config {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index bee5c03bc..ceb33e5c5 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -2304,6 +2304,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		tmpl->irq = 1;
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	tmpl->rxq.vec_rx_wqe_field_ntstore = config->vec_rx_wqe_field_ntstore;
 #endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 62ade3775..6bcdc44a5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -2318,6 +2318,9 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 {
 	uint16_t head = txq->elts_head;
 	unsigned int part;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint32_t flags;
+#endif
 
 	part = MLX5_TXOFF_CONFIG(INLINE) ?
 	       0 : loc->pkts_sent - loc->pkts_copy;
@@ -2331,9 +2334,20 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 		txq->elts_comp = head;
 		if (MLX5_TXOFF_CONFIG(INLINE))
 			txq->wqe_comp = txq->wqe_ci;
-		/* Request unconditional completion on last WQE. */
-		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
-					    MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (txq->tx_wqe_field_ntstore) {
+			flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+			_mm_stream_si32(((void *)(uintptr_t)&last->cseg.flags),
+					flags);
+		} else {
+#endif
+			/* Request unconditional completion on last WQE. */
+			last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 		/* Save elts_head in dedicated free on completion queue. */
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 72763962f..b031eff0b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -156,6 +156,9 @@ struct mlx5_rxq_data {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_obj_type {
@@ -324,6 +327,9 @@ struct mlx5_txq_data {
 	int32_t ts_offset; /* Timestamp field dynamic offset. */
 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+#endif
 #ifndef RTE_ARCH_64
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index 6ddcbfb0a..62a07ef00 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -86,6 +86,10 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 	volatile struct mlx5_wqe_data_seg *wq =
 		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
 	unsigned int i;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint64_t buf_addr2;
+	register uint32_t lkey_t;
+#endif
 
 	MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
 	MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
@@ -107,11 +111,25 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 		 * impact the performance.
 		 */
 		buf_addr = elts[i]->buf_addr;
-		wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
-					      RTE_PKTMBUF_HEADROOM);
-		/* If there's only one MR, no need to replace LKey in WQE. */
-		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-			wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (rxq->vec_rx_wqe_field_ntstore) {
+			buf_addr2 = (uint64_t)rte_cpu_to_be_64((uintptr_t)buf_addr + RTE_PKTMBUF_HEADROOM);
+			_mm_stream_si64(((void *)(uintptr_t)&wq[i].addr), buf_addr2);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) {
+				lkey_t = (uint32_t)mlx5_rx_mb2mr(rxq, elts[i]);
+				_mm_stream_si32(((void *)(uintptr_t)&wq[i].lkey), lkey_t);
+			}
+		} else {
+#endif
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+					RTE_PKTMBUF_HEADROOM);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 	}
 	rxq->rq_ci += n;
 	/* Prevent overflowing into consumed mbufs. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 21fe16b7e..8feac4bdc 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -1562,6 +1562,9 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 					    DEV_TX_OFFLOAD_UDP_TNL_TSO);
 	bool vlan_inline;
 	unsigned int temp;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	txq_ctrl->txq.tx_wqe_field_ntstore = config->tx_wqe_field_ntstore;
+#endif
 
 	if (config->txqs_inline == MLX5_ARG_UNSET)
 		txqs_inline =
-- 
2.25.1


-- 



_Disclaimer: _(c) 2020 VVDN Technologies Pvt. Ltd. This e-mail contains 
PRIVILEGED AND CONFIDENTIAL INFORMATION intended solely for the use of the 
addressee(s). If you are not the intended recipient, please notify the 
sender by e-mail and delete the original message. Further, you are not to 
copy, disclose, or distribute this e-mail or its contents to any other 
person and any such actions are unlawful._
_
_
_
__

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH 3/3] config: added build config file for AMD EPYC platform
  2020-09-07 19:32 [dpdk-dev] [PATCH 1/3] net/mlx5: optimize mprq memcpy Aman Kumar
  2020-09-07 19:32 ` [dpdk-dev] [PATCH 2/3] net/mlx5: add non temporal store for WQE fields Aman Kumar
@ 2020-09-07 19:32 ` Aman Kumar
  2020-09-08  9:11   ` David Marchand
  2020-09-25  3:16 ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2 siblings, 1 reply; 13+ messages in thread
From: Aman Kumar @ 2020-09-07 19:32 UTC (permalink / raw)
  To: dev; +Cc: rasland, keesang.song, aman.kumar

add build config specific to AMD EPYC platform

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/defconfig_x86_64-amdEPYC-linux-gcc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 config/defconfig_x86_64-amdEPYC-linux-gcc

diff --git a/config/defconfig_x86_64-amdEPYC-linux-gcc b/config/defconfig_x86_64-amdEPYC-linux-gcc
new file mode 100644
index 000000000..8c1aa34e2
--- /dev/null
+++ b/config/defconfig_x86_64-amdEPYC-linux-gcc
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2010-2014 Intel Corporation
+
+#include "common_linux"
+
+CONFIG_RTE_MACHINE="native"
+
+CONFIG_RTE_ARCH="x86_64"
+CONFIG_RTE_ARCH_X86_64=y
+CONFIG_RTE_ARCH_X86=y
+CONFIG_RTE_ARCH_64=y
+
+CONFIG_RTE_TOOLCHAIN="gcc"
+CONFIG_RTE_TOOLCHAIN_GCC=y
+CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_LIBRTE_MLX5_PMD=n
+CONFIG_RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY=n
+CONFIG_RTE_LIBRTE_MLX5_NT_STORE=n
-- 
2.25.1


-- 



_Disclaimer: _(c) 2020 VVDN Technologies Pvt. Ltd. This e-mail contains 
PRIVILEGED AND CONFIDENTIAL INFORMATION intended solely for the use of the 
addressee(s). If you are not the intended recipient, please notify the 
sender by e-mail and delete the original message. Further, you are not to 
copy, disclose, or distribute this e-mail or its contents to any other 
person and any such actions are unlawful._
_
_
_
__

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [dpdk-dev] [PATCH 3/3] config: added build config file for AMD EPYC platform
  2020-09-07 19:32 ` [dpdk-dev] [PATCH 3/3] config: added build config file for AMD EPYC platform Aman Kumar
@ 2020-09-08  9:11   ` David Marchand
  0 siblings, 0 replies; 13+ messages in thread
From: David Marchand @ 2020-09-08  9:11 UTC (permalink / raw)
  To: aman.kumar; +Cc: dev, Raslan, Song, Keesang, Thomas Monjalon

Hello Aman,

On Mon, Sep 7, 2020 at 9:33 PM Aman Kumar <aman.kumar@vvdntech.in> wrote:
>
> add build config specific to AMD EPYC platform
>
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>

Make support has just been dropped from the main branch.
You will have to rework this series so that this feature is
handled/configured with meson or, even better, resolved at runtime.


> ---
>  config/defconfig_x86_64-amdEPYC-linux-gcc | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
>  create mode 100644 config/defconfig_x86_64-amdEPYC-linux-gcc
>
> diff --git a/config/defconfig_x86_64-amdEPYC-linux-gcc b/config/defconfig_x86_64-amdEPYC-linux-gcc
> new file mode 100644
> index 000000000..8c1aa34e2
> --- /dev/null
> +++ b/config/defconfig_x86_64-amdEPYC-linux-gcc
> @@ -0,0 +1,18 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2010-2014 Intel Corporation
> +
> +#include "common_linux"
> +
> +CONFIG_RTE_MACHINE="native"
> +
> +CONFIG_RTE_ARCH="x86_64"
> +CONFIG_RTE_ARCH_X86_64=y
> +CONFIG_RTE_ARCH_X86=y
> +CONFIG_RTE_ARCH_64=y
> +
> +CONFIG_RTE_TOOLCHAIN="gcc"
> +CONFIG_RTE_TOOLCHAIN_GCC=y
> +CONFIG_RTE_MAX_LCORE=256
> +CONFIG_RTE_LIBRTE_MLX5_PMD=n
> +CONFIG_RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY=n
> +CONFIG_RTE_LIBRTE_MLX5_NT_STORE=n
> --
> 2.25.1
>
>
> --
>
>
>
> _Disclaimer: _(c) 2020 VVDN Technologies Pvt. Ltd. This e-mail contains
> PRIVILEGED AND CONFIDENTIAL INFORMATION intended solely for the use of the
> addressee(s). If you are not the intended recipient, please notify the
> sender by e-mail and delete the original message. Further, you are not to
> copy, disclose, or distribute this e-mail or its contents to any other
> person and any such actions are unlawful._

Please, get this footer removed for future contributions.

Thanks.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] net/mlx5: optimize mprq memcpy
  2020-09-07 19:32 [dpdk-dev] [PATCH 1/3] net/mlx5: optimize mprq memcpy Aman Kumar
  2020-09-07 19:32 ` [dpdk-dev] [PATCH 2/3] net/mlx5: add non temporal store for WQE fields Aman Kumar
  2020-09-07 19:32 ` [dpdk-dev] [PATCH 3/3] config: added build config file for AMD EPYC platform Aman Kumar
@ 2020-09-25  3:16 ` Aman Kumar
  2020-09-25  3:16   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
  2020-10-10  9:00   ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2 siblings, 2 replies; 13+ messages in thread
From: Aman Kumar @ 2020-09-25  3:16 UTC (permalink / raw)
  To: dev; +Cc: rasland, keesang.song, aman.kumar

add non-temporal load and temporal store for mprq memcpy.
Define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in the build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build |   1 +
 drivers/net/mlx5/mlx5.c      |  12 ++++
 drivers/net/mlx5/mlx5.h      |   3 +
 drivers/net/mlx5/mlx5_rxq.c  |   3 +
 drivers/net/mlx5/mlx5_rxtx.c | 109 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |   3 +
 meson_options.txt            |   2 +
 7 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 9a97bb9c8..38e93fdc1 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -47,6 +47,7 @@ foreach option:cflags_options
 		cflags += option
 	endif
 endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4a807fb4f..0bb1194f7 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -160,6 +160,11 @@
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1622,6 +1627,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1682,6 +1691,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 865e72d31..0f0165884 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -234,6 +234,9 @@ struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 9f68a5cb9..2c7090c54 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
 	mprq_stride_size = non_scatter_min_mbuf_size <=
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 101555ed0..9b4fa9a27 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -123,6 +123,98 @@ uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
 uint64_t rte_net_mlx5_dynf_inline_mask;
 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+static __rte_always_inline
+void copy16B_ts(void *dst, void *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+static __rte_always_inline
+void copy32B_ts(void *dst, void *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static __rte_always_inline
+void copy64B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+}
+
+static __rte_always_inline
+void copy128B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 64));
+	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 96));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3);
+}
+
+static __rte_always_inline
+void memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
+{
+	while (len >= 128) {
+		copy128B_ts(dst, src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		copy64B_ts(dst, src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		copy32B_ts(dst, src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		copy16B_ts(dst, src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -1706,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		uintptr_t data_addr;
+#endif
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
@@ -1774,8 +1869,18 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		    rxq->mprq_repl == NULL ||
 		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+				if (!(rxq->mprq_tstore_memcpy))
+					rte_memcpy((void *)data_addr, addr, len);
+				else if ((rxq->mprq_tstore_memcpy) &&
+					   !((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK))
+					memcpy_aligned_rx_tstore_16B((void *)data_addr,
+							addr, len);
+				else
+#endif
+					rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+							addr, len);
 				DATA_LEN(pkt) = len;
 			} else if (rxq->strd_scatter_en) {
 				struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 6876c1bc4..b3c259774 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -153,6 +153,9 @@ struct mlx5_rxq_data {
 	uint32_t tunnel; /* Tunnel information. */
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
diff --git a/meson_options.txt b/meson_options.txt
index 9bf18ab6b..a4bc565d2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
 	description: 'maximum number of cores/threads supported by EAL')
 option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false,
+	description: 'to enable optimized MPRQ in RX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] net/mlx5: add non temporal store for WQE fields
  2020-09-25  3:16 ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
@ 2020-09-25  3:16   ` Aman Kumar
  2020-10-10  9:00   ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  1 sibling, 0 replies; 13+ messages in thread
From: Aman Kumar @ 2020-09-25  3:16 UTC (permalink / raw)
  To: dev; +Cc: rasland, keesang.song, aman.kumar

Add a non-temporal store for a few WQE fields to optimize
the data path. Define RTE_LIBRTE_MLX5_NT_STORE in the build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build     |  1 +
 drivers/net/mlx5/mlx5.c          | 17 +++++++++++++++++
 drivers/net/mlx5/mlx5.h          |  4 ++++
 drivers/net/mlx5/mlx5_rxq.c      |  3 +++
 drivers/net/mlx5/mlx5_rxtx.c     | 20 +++++++++++++++++---
 drivers/net/mlx5/mlx5_rxtx.h     |  6 ++++++
 drivers/net/mlx5/mlx5_rxtx_vec.h | 29 ++++++++++++++++++++++++-----
 drivers/net/mlx5/mlx5_txq.c      |  3 +++
 meson_options.txt                |  2 ++
 9 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 38e93fdc1..347ca6527 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -48,6 +48,7 @@ foreach option:cflags_options
 	endif
 endforeach
 dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
+dpdk_conf.set('RTE_LIBRTE_MLX5_NT_STORE', get_option('mlx5_ntstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 0bb1194f7..af72f38a1 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -164,6 +164,13 @@
 /* mprq_tstore_memcpy */
 #define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+/* tx_wqe_field_ntstore */
+#define MLX5_TX_WQE_FIELD_NTSTORE "tx_wqe_field_ntstore"
+
+/* vec_rx_wqe_field_ntstore */
+#define MLX5_VEC_RX_WQE_FIELD_NTSTORE "vec_rx_wqe_field_ntstore"
+#endif
 
 /*
  * Device parameter to configure the total data buffer size for a single
@@ -1630,6 +1637,12 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
 		config->mprq_tstore_memcpy = tmp;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	} else if (strcmp(MLX5_TX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->tx_wqe_field_ntstore = tmp;
+	} else if (strcmp(MLX5_VEC_RX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->vec_rx_wqe_field_ntstore = tmp;
 #endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
@@ -1693,6 +1706,10 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_DECAP_EN,
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		MLX5_TX_WQE_FIELD_NTSTORE,
+		MLX5_VEC_RX_WQE_FIELD_NTSTORE,
 #endif
 		NULL,
 	};
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0f0165884..a941e9198 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -237,6 +237,10 @@ struct mlx5_dev_config {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 2c7090c54..be67a087c 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1382,6 +1382,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		tmpl->irq = 1;
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	tmpl->rxq.vec_rx_wqe_field_ntstore = config->vec_rx_wqe_field_ntstore;
 #endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9b4fa9a27..99ce20871 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -2410,6 +2410,9 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 {
 	uint16_t head = txq->elts_head;
 	unsigned int part;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint32_t flags;
+#endif
 
 	part = MLX5_TXOFF_CONFIG(INLINE) ?
 	       0 : loc->pkts_sent - loc->pkts_copy;
@@ -2423,9 +2426,20 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 		txq->elts_comp = head;
 		if (MLX5_TXOFF_CONFIG(INLINE))
 			txq->wqe_comp = txq->wqe_ci;
-		/* Request unconditional completion on last WQE. */
-		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
-					    MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (txq->tx_wqe_field_ntstore) {
+			flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+			_mm_stream_si32(((void *)(uintptr_t)&last->cseg.flags),
+					flags);
+		} else {
+#endif
+			/* Request unconditional completion on last WQE. */
+			last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 		/* Save elts_head in dedicated free on completion queue. */
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index b3c259774..86e39b91a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -156,6 +156,9 @@ struct mlx5_rxq_data {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
@@ -256,6 +259,9 @@ struct mlx5_txq_data {
 	int32_t ts_offset; /* Timestamp field dynamic offset. */
 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+#endif
 #ifndef RTE_ARCH_64
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index a8d6c4f41..413f863ba 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -86,6 +86,10 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 	volatile struct mlx5_wqe_data_seg *wq =
 		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
 	unsigned int i;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint64_t buf_addr2;
+	register uint32_t lkey_t;
+#endif
 
 	MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
 	MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
@@ -107,11 +111,26 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 		 * impact the performance.
 		 */
 		buf_addr = elts[i]->buf_addr;
-		wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
-					      RTE_PKTMBUF_HEADROOM);
-		/* If there's only one MR, no need to replace LKey in WQE. */
-		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-			wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (rxq->vec_rx_wqe_field_ntstore) {
+			buf_addr2 = (uint64_t)rte_cpu_to_be_64((uintptr_t)buf_addr +
+							       RTE_PKTMBUF_HEADROOM);
+			_mm_stream_si64(((void *)(uintptr_t)&wq[i].addr), buf_addr2);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) {
+				lkey_t = (uint32_t)mlx5_rx_mb2mr(rxq, elts[i]);
+				_mm_stream_si32(((void *)(uintptr_t)&wq[i].lkey), lkey_t);
+			}
+		} else {
+#endif
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+					RTE_PKTMBUF_HEADROOM);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 	}
 	rxq->rq_ci += n;
 	/* Prevent overflowing into consumed mbufs. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 1bb667d46..cba675f53 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -1565,6 +1565,9 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 					    DEV_TX_OFFLOAD_UDP_TNL_TSO);
 	bool vlan_inline;
 	unsigned int temp;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	txq_ctrl->txq.tx_wqe_field_ntstore = config->tx_wqe_field_ntstore;
+#endif
 
 	if (config->txqs_inline == MLX5_ARG_UNSET)
 		txqs_inline =
diff --git a/meson_options.txt b/meson_options.txt
index a4bc565d2..21c31d57b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -32,6 +32,8 @@ option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
 option('mlx5_ntload_tstore', type: 'boolean', value: false,
 	description: 'to enable optimized MPRQ in RX datapath')
+option('mlx5_ntstore', type: 'boolean', value: false,
+	description: 'to enable optimized MLX5 TX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
  2020-09-25  3:16 ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2020-09-25  3:16   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
@ 2020-10-10  9:00   ` Aman Kumar
  2020-10-10  9:00     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
                       ` (2 more replies)
  1 sibling, 3 replies; 13+ messages in thread
From: Aman Kumar @ 2020-10-10  9:00 UTC (permalink / raw)
  To: dev
  Cc: rasland, keesang.song, asafp, shys, viacheslavo, akozyrev, matan,
	aman.kumar

Add a non-temporal load and temporal store for the MPRQ memcpy.
Define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in the build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build |   1 +
 drivers/net/mlx5/mlx5.c      |  12 ++++
 drivers/net/mlx5/mlx5.h      |   3 +
 drivers/net/mlx5/mlx5_rxq.c  |   3 +
 drivers/net/mlx5/mlx5_rxtx.c | 116 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |   3 +
 meson_options.txt            |   2 +
 7 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 9a97bb9c8..38e93fdc1 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -47,6 +47,7 @@ foreach option:cflags_options
 		cflags += option
 	endif
 endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 01ead6e6a..a2796eaa5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -160,6 +160,11 @@
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1623,6 +1628,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1683,6 +1692,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 43da9a1fb..1eb305650 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -234,6 +234,9 @@ struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index c059e216d..c8db59a12 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
 	mprq_stride_size = non_scatter_min_mbuf_size <=
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0b87be15b..f59e30d82 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -123,6 +123,97 @@ uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
 uint64_t rte_net_mlx5_dynf_inline_mask;
 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+static void copy16B_ts(void *dst, void *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+static void copy32B_ts(void *dst, void *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static void copy64B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+}
+
+static void copy128B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 64));
+	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 96));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3);
+}
+
+static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		copy128B_ts(dst, src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		copy64B_ts(dst, src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		copy32B_ts(dst, src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		copy16B_ts(dst, src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -1707,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		uintptr_t data_addr;
+#endif
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
@@ -1772,12 +1866,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * - Out of buffer in the Mempool for Multi-Packet RQ.
 		 * - The packet's stride overlaps a headroom and scatter is off.
 		 */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		if (unlikely(!rxq->mprq_tstore_memcpy) &&
+			len <= rxq->mprq_max_memcpy_len) {
+			rte_prefetch1(addr);
+			if (len > RTE_CACHE_LINE_SIZE)
+				rte_prefetch2((void *)((uintptr_t)addr + RTE_CACHE_LINE_SIZE));
+		}
+#endif
 		if (len <= rxq->mprq_max_memcpy_len ||
 		    rxq->mprq_repl == NULL ||
 		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+				if (!(rxq->mprq_tstore_memcpy))
+					rte_memcpy((void *)data_addr, addr, len);
+				else if ((rxq->mprq_tstore_memcpy) &&
+					   !((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK))
+					memcpy_aligned_rx_tstore_16B((void *)data_addr,
+							addr, len);
+				else
+#endif
+					rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+							addr, len);
 				DATA_LEN(pkt) = len;
 			} else if (rxq->strd_scatter_en) {
 				struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 9ffa028d2..a8ea1a795 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -153,6 +153,9 @@ struct mlx5_rxq_data {
 	uint32_t tunnel; /* Tunnel information. */
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
diff --git a/meson_options.txt b/meson_options.txt
index 9bf18ab6b..a4bc565d2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
 	description: 'maximum number of cores/threads supported by EAL')
 option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false,
+	description: 'to enable optimized MPRQ in RX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v3 2/2] net/mlx5: add non temporal store for WQE fields
  2020-10-10  9:00   ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
@ 2020-10-10  9:00     ` Aman Kumar
  2020-10-29  7:59     ` [dpdk-dev] [PATCH v4 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2021-02-04 14:14     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Slava Ovsiienko
  2 siblings, 0 replies; 13+ messages in thread
From: Aman Kumar @ 2020-10-10  9:00 UTC (permalink / raw)
  To: dev
  Cc: rasland, keesang.song, asafp, shys, viacheslavo, akozyrev, matan,
	aman.kumar

Add a non-temporal store for a few WQE fields to optimize
the data path. Define RTE_LIBRTE_MLX5_NT_STORE in the build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build     |   1 +
 drivers/net/mlx5/mlx5.c          |  17 ++
 drivers/net/mlx5/mlx5.h          |   4 +
 drivers/net/mlx5/mlx5_rxq.c      |   3 +
 drivers/net/mlx5/mlx5_rxtx.c     | 322 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h     |   6 +
 drivers/net/mlx5/mlx5_rxtx_vec.h |  29 ++-
 drivers/net/mlx5/mlx5_txq.c      |   3 +
 meson_options.txt                |   2 +
 9 files changed, 378 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 38e93fdc1..347ca6527 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -48,6 +48,7 @@ foreach option:cflags_options
 	endif
 endforeach
 dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
+dpdk_conf.set('RTE_LIBRTE_MLX5_NT_STORE', get_option('mlx5_ntstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index a2796eaa5..01b25a109 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -164,6 +164,13 @@
 /* mprq_tstore_memcpy */
 #define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+/* tx_wqe_field_ntstore */
+#define MLX5_TX_WQE_FIELD_NTSTORE "tx_wqe_field_ntstore"
+
+/* vec_rx_wqe_field_ntstore */
+#define MLX5_VEC_RX_WQE_FIELD_NTSTORE "vec_rx_wqe_field_ntstore"
+#endif
 
 /*
  * Device parameter to configure the total data buffer size for a single
@@ -1631,6 +1638,12 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
 		config->mprq_tstore_memcpy = tmp;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	} else if (strcmp(MLX5_TX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->tx_wqe_field_ntstore = tmp;
+	} else if (strcmp(MLX5_VEC_RX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->vec_rx_wqe_field_ntstore = tmp;
 #endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
@@ -1694,6 +1707,10 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_DECAP_EN,
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		MLX5_TX_WQE_FIELD_NTSTORE,
+		MLX5_VEC_RX_WQE_FIELD_NTSTORE,
 #endif
 		NULL,
 	};
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 1eb305650..9d192465f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -237,6 +237,10 @@ struct mlx5_dev_config {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index c8db59a12..69ad9ab8c 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1382,6 +1382,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		tmpl->irq = 1;
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	tmpl->rxq.vec_rx_wqe_field_ntstore = config->vec_rx_wqe_field_ntstore;
 #endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f59e30d82..76bf20b6f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -214,6 +214,301 @@ static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
 }
 #endif
 
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+static void *amd_memcpy(void *dest, const void *src, size_t size)
+{
+	asm goto (
+	"movq	%0, %%rsi\n\t"
+	"movq	%1, %%rdi\n\t"
+	"movq	%2, %%rdx\n\t"
+	"movq    %%rdi, %%rax\n\t"
+	"cmp     $32, %%rdx\n\t"
+	"jb      less_vec\n\t"
+	"cmp     $(32 * 2), %%rdx\n\t"
+	"ja      more_2x_vec\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"less_vec:\n\t"
+	/* Less than 1 VEC.  */
+	"cmpb    $32, %%dl\n\t"
+	"jae     between_32_63\n\t"
+	"cmpb    $16, %%dl\n\t"
+	"jae     between_16_31\n\t"
+	"cmpb    $8, %%dl\n\t"
+	"jae     between_8_15\n\t"
+	"cmpb    $4, %%dl\n\t"
+	"jae     between_4_7\n\t"
+	"cmpb    $1, %%dl\n\t"
+	"ja      between_2_3\n\t"
+	"jb      1f\n\t"
+	"movzbl  (%%rsi), %%ecx\n\t"
+	"movb    %%cl, (%%rdi)\n\t"
+	"1:\n\t"
+	"jmp %l[done]\n\t"
+	"between_32_63:\n\t"
+	/* From 32 to 63.  No branch when size == 32.  */
+	"vmovdqu (%%rsi), %%ymm0\n\t"
+	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu %%ymm0, (%%rdi)\n\t"
+	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	/* From 16 to 31.  No branch when size == 16.  */
+	"between_16_31:\n\t"
+	"vmovdqu (%%rsi), %%xmm0\n\t"
+	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
+	"vmovdqu %%xmm0, (%%rdi)\n\t"
+	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
+	"jmp %l[done]\n\t"
+	"between_8_15:\n\t"
+	/* From 8 to 15.  No branch when size == 8.  */
+	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq    (%%rsi), %%rsi\n\t"
+	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq    %%rsi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"between_4_7:\n\t"
+	/* From 4 to 7.  No branch when size == 4.  */
+	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl    (%%rsi), %%esi\n\t"
+	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl    %%esi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"between_2_3:\n\t"
+	/* From 2 to 3.  No branch when size == 2.  */
+	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl  (%%rsi), %%esi\n\t"
+	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw    %%si, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"more_2x_vec:\n\t"
+	/* More than 2 * VEC and there may be overlap between destination */
+	/* and source.  */
+	"cmpq    $(32 * 8), %%rdx\n\t"
+	"ja      more_8x_vec\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"jb      last_4x_vec\n\t"
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"last_4x_vec:\n\t"
+	/* Copy from 2 * VEC to 4 * VEC. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"nop:\n\t"
+	"jmp %l[done]\n\t"
+	"more_8x_vec:\n\t"
+	"cmpq    %%rsi, %%rdi\n\t"
+	"ja      more_8x_vec_backward\n\t"
+	/* Source == destination is less common.  */
+	"je      nop\n\t"
+	/* Load the first VEC and last 4 * VEC to support overlapping addresses.  */
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
+	/* Save start and stop of the destination buffer.  */
+	"movq    %%rdi, %%r11\n\t"
+	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
+	/* Align destination for aligned stores in the loop.  Compute */
+	/* how much destination is misaligned.  */
+	"movq    %%rdi, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Get the negative of offset for alignment.  */
+	"subq    $32, %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rsi\n\t"
+	/* Adjust destination which should be aligned now.  */
+	"subq    %%r8, %%rdi\n\t"
+	/* Adjust length.  */
+	"addq    %%r8, %%rdx\n\t"
+	/* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      large_forward\n\t"
+	"loop_4x_vec_forward:\n\t"
+	/* Copy 4 * VEC a time forward.  */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32 * 4), %%rsi\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%rdi)\n\t"
+	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32 * 4), %%rdi\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      loop_4x_vec_forward\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"more_8x_vec_backward:\n\t"
+	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   32(%%rsi), %%ymm5\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
+	/* Save stop of the destination buffer.  */
+	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
+	/* Align destination end for aligned stores in the loop.  Compute */
+	/* how much destination end is misaligned.  */
+	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
+	"movq    %%r11, %%r9\n\t"
+	"movq    %%r11, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rcx\n\t"
+	/* Adjust the end of destination which should be aligned now.  */
+	"subq    %%r8, %%r9\n\t"
+	/* Adjust length.  */
+	"subq    %%r8, %%rdx\n\t"
+	 /* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      large_backward\n\t"
+	"loop_4x_vec_backward:\n\t"
+	/* Copy 4 * VEC a time backward.  */
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32 * 4), %%rcx\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%r9)\n\t"
+	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
+	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      loop_4x_vec_backward\n\t"
+	/* Store the first 4 * VEC. */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC. */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+
+	"large_forward:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded. */
+	"leaq    (%%rdi, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%rsi\n\t"
+	"jb      loop_4x_vec_forward\n\t"
+	"loop_large_forward:\n\t"
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	"prefetcht0 (32*4*2)(%%rsi)\n\t"
+	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32*4), %%rsi\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%rdi)\n\t"
+	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
+	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32*4), %%rdi\n\t"
+	"cmpq    $(32*4), %%rdx\n\t"
+	"ja      loop_large_forward\n\t"
+	"sfence\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"large_backward:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded.  */
+	"leaq    (%%rcx, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%r9\n\t"
+	"jb      loop_4x_vec_backward\n\t"
+	"loop_large_backward:\n\t"
+	/* Copy 4 * VEC a time backward with non-temporal stores. */
+	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32*4), %%rcx\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%r9)\n\t"
+	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
+	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      loop_large_backward\n\t"
+	"sfence\n\t"
+	/* Store the first 4 * VEC.  */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC.  */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]"
+	:
+	: "r"(src), "r"(dest), "r"(size)
+	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
+	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
+	: done
+	);
+done:
+	return dest;
+}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -2419,6 +2714,9 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 {
 	uint16_t head = txq->elts_head;
 	unsigned int part;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint32_t flags;
+#endif
 
 	part = MLX5_TXOFF_CONFIG(INLINE) ?
 	       0 : loc->pkts_sent - loc->pkts_copy;
@@ -2432,9 +2730,20 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 		txq->elts_comp = head;
 		if (MLX5_TXOFF_CONFIG(INLINE))
 			txq->wqe_comp = txq->wqe_ci;
-		/* Request unconditional completion on last WQE. */
-		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
-					    MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (txq->tx_wqe_field_ntstore) {
+			flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+			_mm_stream_si32(((void *)(uintptr_t)&last->cseg.flags),
+					flags);
+		} else {
+#endif
+			/* Request unconditional completion on last WQE. */
+			last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 		/* Save elts_head in dedicated free on completion queue. */
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
@@ -3162,7 +3471,12 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
 	part = (uint8_t *)txq->wqes_end - pdst;
 	part = RTE_MIN(part, len);
 	do {
-		rte_memcpy(pdst, buf, part);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (likely(txq->tx_wqe_field_ntstore))
+			amd_memcpy(pdst, buf, part);
+		else
+#endif
+			rte_memcpy(pdst, buf, part);
 		len -= part;
 		if (likely(!len)) {
 			pdst += part;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index a8ea1a795..f1e59a881 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -156,6 +156,9 @@ struct mlx5_rxq_data {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
@@ -256,6 +259,9 @@ struct mlx5_txq_data {
 	int32_t ts_offset; /* Timestamp field dynamic offset. */
 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+#endif
 #ifndef RTE_ARCH_64
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index a8d6c4f41..413f863ba 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -86,6 +86,10 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 	volatile struct mlx5_wqe_data_seg *wq =
 		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
 	unsigned int i;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint64_t buf_addr2;
+	register uint32_t lkey_t;
+#endif
 
 	MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
 	MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
@@ -107,11 +111,26 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 		 * impact the performance.
 		 */
 		buf_addr = elts[i]->buf_addr;
-		wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
-					      RTE_PKTMBUF_HEADROOM);
-		/* If there's only one MR, no need to replace LKey in WQE. */
-		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-			wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (rxq->vec_rx_wqe_field_ntstore) {
+			buf_addr2 = (uint64_t)rte_cpu_to_be_64((uintptr_t)buf_addr +
+							       RTE_PKTMBUF_HEADROOM);
+			_mm_stream_si64(((void *)(uintptr_t)&wq[i].addr), buf_addr2);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) {
+				lkey_t = (uint32_t)mlx5_rx_mb2mr(rxq, elts[i]);
+				_mm_stream_si32(((void *)(uintptr_t)&wq[i].lkey), lkey_t);
+			}
+		} else {
+#endif
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+					RTE_PKTMBUF_HEADROOM);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 	}
 	rxq->rq_ci += n;
 	/* Prevent overflowing into consumed mbufs. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 1bb667d46..cba675f53 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -1565,6 +1565,9 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 					    DEV_TX_OFFLOAD_UDP_TNL_TSO);
 	bool vlan_inline;
 	unsigned int temp;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	txq_ctrl->txq.tx_wqe_field_ntstore = config->tx_wqe_field_ntstore;
+#endif
 
 	if (config->txqs_inline == MLX5_ARG_UNSET)
 		txqs_inline =
diff --git a/meson_options.txt b/meson_options.txt
index a4bc565d2..21c31d57b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -32,6 +32,8 @@ option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
 option('mlx5_ntload_tstore', type: 'boolean', value: false,
 	description: 'to enable optimized MPRQ in RX datapath')
+option('mlx5_ntstore', type: 'boolean', value: false,
+	description: 'to enable optimized MLX5 TX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v4 1/2] net/mlx5: optimize mprq memcpy
  2020-10-10  9:00   ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2020-10-10  9:00     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
@ 2020-10-29  7:59     ` Aman Kumar
  2020-10-29  7:59       ` [dpdk-dev] [PATCH v4 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
  2021-02-04 14:14     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Slava Ovsiienko
  2 siblings, 1 reply; 13+ messages in thread
From: Aman Kumar @ 2020-10-29  7:59 UTC (permalink / raw)
  To: dev
  Cc: rasland, keesang.song, aman.kumar, asafp, shys, viacheslavo,
	akozyrev, matan

add a non-temporal load and temporal store path for the MPRQ memcpy.
Define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in the build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build |   1 +
 drivers/net/mlx5/mlx5.c      |  12 ++++
 drivers/net/mlx5/mlx5.h      |   3 +
 drivers/net/mlx5/mlx5_rxq.c  |   3 +
 drivers/net/mlx5/mlx5_rxtx.c | 116 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |   3 +
 meson_options.txt            |   2 +
 7 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 9a97bb9c8..38e93fdc1 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -47,6 +47,7 @@ foreach option:cflags_options
 		cflags += option
 	endif
 endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 77aeac85c..a0913e161 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -160,6 +160,11 @@
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1655,6 +1660,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1715,6 +1724,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 1408cf94d..42934f6ca 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -236,6 +236,9 @@ struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index f1d837307..59b635e0b 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
 	mprq_stride_size = non_scatter_min_mbuf_size <=
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b530ff421..761dc88f3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -121,6 +121,97 @@ uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
 uint64_t rte_net_mlx5_dynf_inline_mask;
 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* Copy 16 bytes: non-temporal (movntdqa) load from src, unaligned temporal
+ * store to dst. The streaming load requires src to be 16-byte aligned.
+ */
+static void copy16B_ts(void *dst, void *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+/* Copy 32 bytes: non-temporal load from src, unaligned temporal store to dst.
+ * NOTE(review): the ymm streaming load (vmovntdqa) requires src to be
+ * 32-byte aligned, but the caller only gates on ALIGNMENT_MASK -- confirm
+ * that mask guarantees 32B alignment, not just 16B as the caller's name
+ * ("..._16B") suggests.
+ */
+static void copy32B_ts(void *dst, void *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/* Copy 64 bytes as two 32B non-temporal loads + unaligned temporal stores.
+ * Same 32-byte source-alignment requirement as copy32B_ts().
+ */
+static void copy64B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+}
+
+/* Copy 128 bytes as four 32B non-temporal loads + unaligned temporal stores.
+ * Same 32-byte source-alignment requirement as copy32B_ts().
+ */
+static void copy128B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 64));
+	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 96));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3);
+}
+
+/*
+ * memcpy variant for the MPRQ Rx path: non-temporal (streaming) loads from
+ * src with regular temporal stores to dst, copying in 128/64/32/16-byte
+ * vector chunks and falling back to scalar copies for the tail.
+ * Returns the original dst pointer, like memcpy().
+ * NOTE(review): the streaming-load helpers need aligned src (16B for the
+ * SSE step, 32B for the AVX steps); the caller gates only on ALIGNMENT_MASK
+ * -- verify that mask enforces the strictest alignment used here.
+ */
+static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		copy128B_ts(dst, src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		copy64B_ts(dst, src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		copy32B_ts(dst, src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		copy16B_ts(dst, src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	/*
+	 * Tail of 1-3 bytes. The previous code rewound both pointers by
+	 * (4 - len) and issued a single 4-byte copy; that reads and writes
+	 * BEFORE the start of both buffers whenever the overall copy is
+	 * shorter than 4 bytes. Copy the remaining bytes one at a time
+	 * instead, which is safe for any input length and produces the
+	 * same final contents for in-bounds inputs.
+	 */
+	while (len > 0) {
+		*(uint8_t *)dst = *(const uint8_t *)src;
+		dst = (uint8_t *)dst + 1;
+		src = (uint8_t *)src + 1;
+		len--;
+	}
+
+	return dest;
+}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -1611,6 +1702,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		uintptr_t data_addr;
+#endif
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
@@ -1676,12 +1770,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * - Out of buffer in the Mempool for Multi-Packet RQ.
 		 * - The packet's stride overlaps a headroom and scatter is off.
 		 */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		if (unlikely(!rxq->mprq_tstore_memcpy) &&
+			len <= rxq->mprq_max_memcpy_len) {
+			rte_prefetch1(addr);
+			if (len > RTE_CACHE_LINE_SIZE)
+				rte_prefetch2((void *)((uintptr_t)addr + RTE_CACHE_LINE_SIZE));
+		}
+#endif
 		if (len <= rxq->mprq_max_memcpy_len ||
 		    rxq->mprq_repl == NULL ||
 		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+				if (!(rxq->mprq_tstore_memcpy))
+					rte_memcpy((void *)data_addr, addr, len);
+				else if ((rxq->mprq_tstore_memcpy) &&
+					   !((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK))
+					memcpy_aligned_rx_tstore_16B((void *)data_addr,
+							addr, len);
+				else
+#endif
+					rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+							addr, len);
 				DATA_LEN(pkt) = len;
 			} else if (rxq->strd_scatter_en) {
 				struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 674296ee9..750371014 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -153,6 +153,9 @@ struct mlx5_rxq_data {
 	uint32_t tunnel; /* Tunnel information. */
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
diff --git a/meson_options.txt b/meson_options.txt
index 9bf18ab6b..a4bc565d2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
 	description: 'maximum number of cores/threads supported by EAL')
 option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false,
+	description: 'to enable optimized MPRQ in RX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v4 2/2] net/mlx5: add non temporal store for WQE fields
  2020-10-29  7:59     ` [dpdk-dev] [PATCH v4 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
@ 2020-10-29  7:59       ` Aman Kumar
  0 siblings, 0 replies; 13+ messages in thread
From: Aman Kumar @ 2020-10-29  7:59 UTC (permalink / raw)
  To: dev
  Cc: rasland, keesang.song, aman.kumar, asafp, shys, viacheslavo,
	akozyrev, matan

add non-temporal stores for a few WQE fields to optimize the
data path. Define RTE_LIBRTE_MLX5_NT_STORE in the build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build     |   1 +
 drivers/net/mlx5/mlx5.c          |  17 ++
 drivers/net/mlx5/mlx5.h          |   4 +
 drivers/net/mlx5/mlx5_rxq.c      |   3 +
 drivers/net/mlx5/mlx5_rxtx.c     | 321 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h     |   6 +
 drivers/net/mlx5/mlx5_rxtx_vec.h |  29 ++-
 drivers/net/mlx5/mlx5_txq.c      |   3 +
 meson_options.txt                |   2 +
 9 files changed, 377 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 38e93fdc1..347ca6527 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -48,6 +48,7 @@ foreach option:cflags_options
 	endif
 endforeach
 dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
+dpdk_conf.set('RTE_LIBRTE_MLX5_NT_STORE', get_option('mlx5_ntstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index a0913e161..04c8a7c08 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -164,6 +164,13 @@
 /* mprq_tstore_memcpy */
 #define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+/* tx_wqe_field_ntstore */
+#define MLX5_TX_WQE_FIELD_NTSTORE "tx_wqe_field_ntstore"
+
+/* vec_rx_wqe_field_ntstore */
+#define MLX5_VEC_RX_WQE_FIELD_NTSTORE "vec_rx_wqe_field_ntstore"
+#endif
 
 /*
  * Device parameter to configure the total data buffer size for a single
@@ -1663,6 +1670,12 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
 		config->mprq_tstore_memcpy = tmp;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	} else if (strcmp(MLX5_TX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->tx_wqe_field_ntstore = tmp;
+	} else if (strcmp(MLX5_VEC_RX_WQE_FIELD_NTSTORE, key) == 0) {
+		config->vec_rx_wqe_field_ntstore = tmp;
 #endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
@@ -1726,6 +1739,10 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_DECAP_EN,
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		MLX5_TX_WQE_FIELD_NTSTORE,
+		MLX5_VEC_RX_WQE_FIELD_NTSTORE,
 #endif
 		NULL,
 	};
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 42934f6ca..978a9d697 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -239,6 +239,10 @@ struct mlx5_dev_config {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 59b635e0b..4a8f301e1 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1382,6 +1382,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		tmpl->irq = 1;
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	tmpl->rxq.vec_rx_wqe_field_ntstore = config->vec_rx_wqe_field_ntstore;
 #endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 761dc88f3..84f6fea9e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -212,6 +212,300 @@ static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
 }
 #endif
 
+#if defined(RTE_ARCH_X86_64) && defined(RTE_LIBRTE_MLX5_NT_STORE)
+/*
+ * memmove-style copy tuned with hand-written AVX2 assembly: unaligned
+ * vector copies for sizes below 8*32B, 4x32B aligned-store loops for
+ * medium sizes, and non-temporal (vmovntdq) stores plus sfence once the
+ * copy exceeds 1MB and the buffers do not overlap. Forward and backward
+ * paths preserve overlap semantics. Returns dest, like memcpy().
+ *
+ * NOTE(review): the operands use generic "r" constraints and are then
+ * moved into rsi/rdi/rdx, yet those registers also appear in the clobber
+ * list -- the compiler may legally allocate an input to a register the
+ * asm clobbers. Explicit "S"/"D"/"d" (or register-asm variables) would be
+ * safer.
+ * NOTE(review): "r12" is declared clobbered but never used; the condition
+ * codes are modified (cmp/sub) but "cc" is not declared.
+ * NOTE(review): the labels (less_vec, more_2x_vec, ...) are assembler
+ * -global, so if the compiler ever emits two instances of this asm in one
+ * translation unit (inlining/cloning), assembly fails with duplicate
+ * symbols.
+ */
+static void *amd_memcpy(void *dest, const void *src, size_t size)
+{
+	asm goto("movq	%0, %%rsi\n\t"
+	"movq	%1, %%rdi\n\t"
+	"movq	%2, %%rdx\n\t"
+	"movq    %%rdi, %%rax\n\t"
+	"cmp     $32, %%rdx\n\t"
+	"jb      less_vec\n\t"
+	"cmp     $(32 * 2), %%rdx\n\t"
+	"ja      more_2x_vec\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"less_vec:\n\t"
+	/* Less than 1 VEC.  */
+	"cmpb    $32, %%dl\n\t"
+	"jae     between_32_63\n\t"
+	"cmpb    $16, %%dl\n\t"
+	"jae     between_16_31\n\t"
+	"cmpb    $8, %%dl\n\t"
+	"jae     between_8_15\n\t"
+	"cmpb    $4, %%dl\n\t"
+	"jae     between_4_7\n\t"
+	"cmpb    $1, %%dl\n\t"
+	"ja      between_2_3\n\t"
+	"jb      1f\n\t"
+	"movzbl  (%%rsi), %%ecx\n\t"
+	"movb    %%cl, (%%rdi)\n\t"
+	"1:\n\t"
+	"jmp %l[done]\n\t"
+	"between_32_63:\n\t"
+	/* From 32 to 63.  No branch when size == 32.  */
+	"vmovdqu (%%rsi), %%ymm0\n\t"
+	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu %%ymm0, (%%rdi)\n\t"
+	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	/* From 16 to 31.  No branch when size == 16.  */
+	"between_16_31:\n\t"
+	"vmovdqu (%%rsi), %%xmm0\n\t"
+	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
+	"vmovdqu %%xmm0, (%%rdi)\n\t"
+	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
+	"jmp %l[done]\n\t"
+	"between_8_15:\n\t"
+	/* From 8 to 15.  No branch when size == 8.  */
+	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq    (%%rsi), %%rsi\n\t"
+	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq    %%rsi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"between_4_7:\n\t"
+	/* From 4 to 7.  No branch when size == 4.  */
+	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl    (%%rsi), %%esi\n\t"
+	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl    %%esi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"between_2_3:\n\t"
+	/* From 2 to 3.  No branch when size == 2.  */
+	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl  (%%rsi), %%esi\n\t"
+	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw    %%si, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"more_2x_vec:\n\t"
+	/* More than 2 * VEC and there may be overlap between destination */
+	/* and source.  */
+	"cmpq    $(32 * 8), %%rdx\n\t"
+	"ja      more_8x_vec\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"jb      last_4x_vec\n\t"
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"last_4x_vec:\n\t"
+	/* Copy from 2 * VEC to 4 * VEC. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"nop:\n\t"
+	"jmp %l[done]\n\t"
+	"more_8x_vec:\n\t"
+	"cmpq    %%rsi, %%rdi\n\t"
+	"ja      more_8x_vec_backward\n\t"
+	/* Source == destination is less common.  */
+	"je      nop\n\t"
+	/* Load the first VEC and last 4 * VEC to support overlapping addresses.  */
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
+	/* Save start and stop of the destination buffer.  */
+	"movq    %%rdi, %%r11\n\t"
+	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
+	/* Align destination for aligned stores in the loop.  Compute */
+	/* how much destination is misaligned.  */
+	"movq    %%rdi, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Get the negative of offset for alignment.  */
+	"subq    $32, %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rsi\n\t"
+	/* Adjust destination which should be aligned now.  */
+	"subq    %%r8, %%rdi\n\t"
+	/* Adjust length.  */
+	"addq    %%r8, %%rdx\n\t"
+	/* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      large_forward\n\t"
+	"loop_4x_vec_forward:\n\t"
+	/* Copy 4 * VEC a time forward.  */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32 * 4), %%rsi\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%rdi)\n\t"
+	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32 * 4), %%rdi\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      loop_4x_vec_forward\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"more_8x_vec_backward:\n\t"
+	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   32(%%rsi), %%ymm5\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
+	/* Save stop of the destination buffer.  */
+	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
+	/* Align destination end for aligned stores in the loop.  Compute */
+	/* how much destination end is misaligned.  */
+	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
+	"movq    %%r11, %%r9\n\t"
+	"movq    %%r11, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rcx\n\t"
+	/* Adjust the end of destination which should be aligned now.  */
+	"subq    %%r8, %%r9\n\t"
+	/* Adjust length.  */
+	"subq    %%r8, %%rdx\n\t"
+	 /* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      large_backward\n\t"
+	"loop_4x_vec_backward:\n\t"
+	/* Copy 4 * VEC a time backward.  */
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32 * 4), %%rcx\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%r9)\n\t"
+	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
+	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      loop_4x_vec_backward\n\t"
+	/* Store the first 4 * VEC. */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC. */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+
+	"large_forward:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded. */
+	"leaq    (%%rdi, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%rsi\n\t"
+	"jb      loop_4x_vec_forward\n\t"
+	"loop_large_forward:\n\t"
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	"prefetcht0 (32*4*2)(%%rsi)\n\t"
+	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32*4), %%rsi\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%rdi)\n\t"
+	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
+	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32*4), %%rdi\n\t"
+	"cmpq    $(32*4), %%rdx\n\t"
+	"ja      loop_large_forward\n\t"
+	"sfence\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"large_backward:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded.  */
+	"leaq    (%%rcx, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%r9\n\t"
+	"jb      loop_4x_vec_backward\n\t"
+	"loop_large_backward:\n\t"
+	/* Copy 4 * VEC a time backward with non-temporal stores. */
+	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32*4), %%rcx\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%r9)\n\t"
+	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
+	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      loop_large_backward\n\t"
+	"sfence\n\t"
+	/* Store the first 4 * VEC.  */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC.  */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]"
+	:
+	: "r"(src), "r"(dest), "r"(size)
+	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
+	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
+	: done
+	);
+done:
+	return dest;
+}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -2323,6 +2617,9 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 {
 	uint16_t head = txq->elts_head;
 	unsigned int part;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint32_t flags;
+#endif
 
 	part = MLX5_TXOFF_CONFIG(INLINE) ?
 	       0 : loc->pkts_sent - loc->pkts_copy;
@@ -2336,9 +2633,20 @@ mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
 		txq->elts_comp = head;
 		if (MLX5_TXOFF_CONFIG(INLINE))
 			txq->wqe_comp = txq->wqe_ci;
-		/* Request unconditional completion on last WQE. */
-		last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
-					    MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (txq->tx_wqe_field_ntstore) {
+			flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+			_mm_stream_si32(((void *)(uintptr_t)&last->cseg.flags),
+					flags);
+		} else {
+#endif
+			/* Request unconditional completion on last WQE. */
+			last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+					MLX5_COMP_MODE_OFFSET);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 		/* Save elts_head in dedicated free on completion queue. */
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 		txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
@@ -3147,7 +3455,12 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
 	part = (uint8_t *)txq->wqes_end - pdst;
 	part = RTE_MIN(part, len);
 	do {
-		rte_memcpy(pdst, buf, part);
+#if defined(RTE_ARCH_X86_64) && defined(RTE_LIBRTE_MLX5_NT_STORE)
+		if (likely(txq->tx_wqe_field_ntstore))
+			amd_memcpy(pdst, buf, part);
+		else
+#endif
+			rte_memcpy(pdst, buf, part);
 		len -= part;
 		if (likely(!len)) {
 			pdst += part;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 750371014..11fd0918b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -156,6 +156,9 @@ struct mlx5_rxq_data {
 #ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
 	unsigned int mprq_tstore_memcpy:1;
 #endif
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int vec_rx_wqe_field_ntstore:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
@@ -256,6 +259,9 @@ struct mlx5_txq_data {
 	int32_t ts_offset; /* Timestamp field dynamic offset. */
 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	unsigned int tx_wqe_field_ntstore:1;
+#endif
 #ifndef RTE_ARCH_64
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index a8d6c4f41..413f863ba 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -86,6 +86,10 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 	volatile struct mlx5_wqe_data_seg *wq =
 		&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];
 	unsigned int i;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	register uint64_t buf_addr2;
+	register uint32_t lkey_t;
+#endif
 
 	MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));
 	MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi)));
@@ -107,11 +111,26 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 		 * impact the performance.
 		 */
 		buf_addr = elts[i]->buf_addr;
-		wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
-					      RTE_PKTMBUF_HEADROOM);
-		/* If there's only one MR, no need to replace LKey in WQE. */
-		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-			wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		if (rxq->vec_rx_wqe_field_ntstore) {
+			buf_addr2 = (uint64_t)rte_cpu_to_be_64((uintptr_t)buf_addr +
+							       RTE_PKTMBUF_HEADROOM);
+			_mm_stream_si64(((void *)(uintptr_t)&wq[i].addr), buf_addr2);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) {
+				lkey_t = (uint32_t)mlx5_rx_mb2mr(rxq, elts[i]);
+				_mm_stream_si32(((void *)(uintptr_t)&wq[i].lkey), lkey_t);
+			}
+		} else {
+#endif
+			wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
+					RTE_PKTMBUF_HEADROOM);
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+				wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+		}
+#endif
 	}
 	rxq->rq_ci += n;
 	/* Prevent overflowing into consumed mbufs. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index af84f5f72..f9160f551 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -779,6 +779,9 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl)
 					    DEV_TX_OFFLOAD_UDP_TNL_TSO);
 	bool vlan_inline;
 	unsigned int temp;
+#ifdef RTE_LIBRTE_MLX5_NT_STORE
+	txq_ctrl->txq.tx_wqe_field_ntstore = config->tx_wqe_field_ntstore;
+#endif
 
 	if (config->txqs_inline == MLX5_ARG_UNSET)
 		txqs_inline =
diff --git a/meson_options.txt b/meson_options.txt
index a4bc565d2..21c31d57b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -32,6 +32,8 @@ option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
 option('mlx5_ntload_tstore', type: 'boolean', value: false,
 	description: 'to enable optimized MPRQ in RX datapath')
+option('mlx5_ntstore', type: 'boolean', value: false,
+	description: 'to enable optimized MLX5 TX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
  2020-10-10  9:00   ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
  2020-10-10  9:00     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
  2020-10-29  7:59     ` [dpdk-dev] [PATCH v4 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
@ 2021-02-04 14:14     ` Slava Ovsiienko
  2021-02-09  6:22       ` Aman Kumar
  2 siblings, 1 reply; 13+ messages in thread
From: Slava Ovsiienko @ 2021-02-04 14:14 UTC (permalink / raw)
  To: Aman Kumar, dev
  Cc: Raslan Darawsheh, keesang.song, Asaf Penso, Shy Shyman,
	Alexander Kozyrev, Matan Azrad

Hi,

I'm sorry for asking the questions very late.
Is this patch set still relevant, and should it be updated and considered? 

As I understand it, this one optimizes the memory writes in some way using the instructions with the hints.
Is this specific to some CPU families, or is it more common? I suppose it should be considered and discussed
more widely, possibly at the EAL level. I would propose to introduce these special memory routines at the EAL level
to give an advantage to all PMDs, not specifically to mlx5.

With best regards,
Slava

> -----Original Message-----
> From: Aman Kumar <aman.kumar@vvdntech.in>
> Sent: Saturday, October 10, 2020 12:01
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; keesang.song@amd.com;
> Asaf Penso <asafp@nvidia.com>; Shy Shyman <shys@nvidia.com>; Slava
> Ovsiienko <viacheslavo@nvidia.com>; Alexander Kozyrev
> <akozyrev@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> aman.kumar@vvdntech.in
> Subject: [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
> 
> add non temporal load and temporal store for mprq memcpy.
> define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in build
> configuration to enable this optimization.
> 
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> ---
>  drivers/net/mlx5/meson.build |   1 +
>  drivers/net/mlx5/mlx5.c      |  12 ++++
>  drivers/net/mlx5/mlx5.h      |   3 +
>  drivers/net/mlx5/mlx5_rxq.c  |   3 +
>  drivers/net/mlx5/mlx5_rxtx.c | 116
> ++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx5/mlx5_rxtx.h |   3 +
>  meson_options.txt            |   2 +
>  7 files changed, 138 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
> index 9a97bb9c8..38e93fdc1 100644
> --- a/drivers/net/mlx5/meson.build
> +++ b/drivers/net/mlx5/meson.build
> @@ -47,6 +47,7 @@ foreach option:cflags_options
>  		cflags += option
>  	endif
>  endforeach
> +dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY',
> +get_option('mlx5_ntload_tstore'))
>  if get_option('buildtype').contains('debug')
>  	cflags += [ '-pedantic', '-DPEDANTIC' ]  else diff --git
> a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 01ead6e6a..a2796eaa5 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -160,6 +160,11 @@
>  /* Configure timeout of LRO session (in microseconds). */  #define
> MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
> 
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +/* mprq_tstore_memcpy */
> +#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
> +#endif
> +
>  /*
>   * Device parameter to configure the total data buffer size for a single
>   * hairpin queue (logarithm value).
> @@ -1623,6 +1628,10 @@ mlx5_args_check(const char *key, const char
> *val, void *opaque)
>  		config->sys_mem_en = !!tmp;
>  	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
>  		config->decap_en = !!tmp;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
> +		config->mprq_tstore_memcpy = tmp;
> +#endif
>  	} else {
>  		DRV_LOG(WARNING, "%s: unknown parameter", key);
>  		rte_errno = EINVAL;
> @@ -1683,6 +1692,9 @@ mlx5_args(struct mlx5_dev_config *config, struct
> rte_devargs *devargs)
>  		MLX5_RECLAIM_MEM,
>  		MLX5_SYS_MEM_EN,
>  		MLX5_DECAP_EN,
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +		MLX5_MPRQ_TSTORE_MEMCPY,
> +#endif
>  		NULL,
>  	};
>  	struct rte_kvargs *kvlist;
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 43da9a1fb..1eb305650 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -234,6 +234,9 @@ struct mlx5_dev_config {
>  	int tx_skew; /* Tx scheduling skew between WQE and data on wire.
> */
>  	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
>  	struct mlx5_lro_config lro; /* LRO configuration. */
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	unsigned int mprq_tstore_memcpy:1;
> +#endif
>  };
> 
> 
> diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index
> c059e216d..c8db59a12 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>  	tmpl->socket = socket;
>  	if (dev->data->dev_conf.intr_conf.rxq)
>  		tmpl->irq = 1;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
> #endif
>  	mprq_stride_nums = config->mprq.stride_num_n ?
>  		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
>  	mprq_stride_size = non_scatter_min_mbuf_size <= diff --git
> a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> 0b87be15b..f59e30d82 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -123,6 +123,97 @@ uint8_t mlx5_swp_types_table[1 << 10]
> __rte_cache_aligned;  uint64_t rte_net_mlx5_dynf_inline_mask;  #define
> PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
> 
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +static void copy16B_ts(void *dst, void *src) {
> +	__m128i var128;
> +
> +	var128 = _mm_stream_load_si128((__m128i *)src);
> +	_mm_storeu_si128((__m128i *)dst, var128); }
> +
> +static void copy32B_ts(void *dst, void *src) {
> +	__m256i ymm0;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	_mm256_storeu_si256((__m256i *)dst, ymm0); }
> +
> +static void copy64B_ts(void *dst, void *src) {
> +	__m256i ymm0, ymm1;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 32));
> +	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1); }
> +
> +static void copy128B_ts(void *dst, void *src) {
> +	__m256i ymm0, ymm1, ymm2, ymm3;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 32));
> +	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 64));
> +	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 96));
> +	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3); }
> +
> +static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int
> +len) {
> +	void *dest = dst;
> +
> +	while (len >= 128) {
> +		copy128B_ts(dst, src);
> +		dst = (uint8_t *)dst + 128;
> +		src = (uint8_t *)src + 128;
> +		len -= 128;
> +	}
> +	while (len >= 64) {
> +		copy64B_ts(dst, src);
> +		dst = (uint8_t *)dst + 64;
> +		src = (uint8_t *)src + 64;
> +		len -= 64;
> +	}
> +	while (len >= 32) {
> +		copy32B_ts(dst, src);
> +		dst = (uint8_t *)dst + 32;
> +		src = (uint8_t *)src + 32;
> +		len -= 32;
> +	}
> +	if (len >= 16) {
> +		copy16B_ts(dst, src);
> +		dst = (uint8_t *)dst + 16;
> +		src = (uint8_t *)src + 16;
> +		len -= 16;
> +	}
> +	if (len >= 8) {
> +		*(uint64_t *)dst = *(const uint64_t *)src;
> +		dst = (uint8_t *)dst + 8;
> +		src = (uint8_t *)src + 8;
> +		len -= 8;
> +	}
> +	if (len >= 4) {
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +		dst = (uint8_t *)dst + 4;
> +		src = (uint8_t *)src + 4;
> +		len -= 4;
> +	}
> +	if (len != 0) {
> +		dst = (uint8_t *)dst - (4 - len);
> +		src = (uint8_t *)src - (4 - len);
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +	}
> +
> +	return dest;
> +}
> +#endif
> +
>  /**
>   * Build a table to translate Rx completion flags to packet type.
>   *
> @@ -1707,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> rte_mbuf **pkts, uint16_t pkts_n)
>  		int32_t hdrm_overlap;
>  		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
>  		uint32_t rss_hash_res = 0;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +		uintptr_t data_addr;
> +#endif
> 
>  		if (consumed_strd == strd_n) {
>  			/* Replace WQE only if the buffer is still in use. */
> @@ -1772,12 +1866,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> rte_mbuf **pkts, uint16_t pkts_n)
>  		 * - Out of buffer in the Mempool for Multi-Packet RQ.
>  		 * - The packet's stride overlaps a headroom and scatter is
> off.
>  		 */
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +		if (unlikely(!rxq->mprq_tstore_memcpy) &&
> +			len <= rxq->mprq_max_memcpy_len) {
> +			rte_prefetch1(addr);
> +			if (len > RTE_CACHE_LINE_SIZE)
> +				rte_prefetch2((void *)((uintptr_t)addr +
> RTE_CACHE_LINE_SIZE));
> +		}
> +#endif
>  		if (len <= rxq->mprq_max_memcpy_len ||
>  		    rxq->mprq_repl == NULL ||
>  		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
>  			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
> -				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
> -					   addr, len);
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +				data_addr =
> (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
> +				if (!(rxq->mprq_tstore_memcpy))
> +					rte_memcpy((void *)data_addr,
> addr, len);
> +				else if ((rxq->mprq_tstore_memcpy) &&
> +					   !((data_addr | (uintptr_t)addr) &
> ALIGNMENT_MASK))
> +
> 	memcpy_aligned_rx_tstore_16B((void *)data_addr,
> +							addr, len);
> +				else
> +#endif
> +					rte_memcpy(rte_pktmbuf_mtod(pkt,
> void *),
> +							addr, len);
>  				DATA_LEN(pkt) = len;
>  			} else if (rxq->strd_scatter_en) {
>  				struct rte_mbuf *prev = pkt;
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 9ffa028d2..a8ea1a795 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -153,6 +153,9 @@ struct mlx5_rxq_data {
>  	uint32_t tunnel; /* Tunnel information. */
>  	uint64_t flow_meta_mask;
>  	int32_t flow_meta_offset;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	unsigned int mprq_tstore_memcpy:1;
> +#endif
>  } __rte_cache_aligned;
> 
>  enum mlx5_rxq_type {
> diff --git a/meson_options.txt b/meson_options.txt index
> 9bf18ab6b..a4bc565d2 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
>  	description: 'maximum number of cores/threads supported by EAL')
> option('max_numa_nodes', type: 'integer', value: 4,
>  	description: 'maximum number of NUMA nodes supported by EAL')
> +option('mlx5_ntload_tstore', type: 'boolean', value: false,
> +	description: 'to enable optimized MPRQ in RX datapath')
>  option('enable_trace_fp', type: 'boolean', value: false,
>  	description: 'enable fast path trace points.')  option('tests', type:
> 'boolean', value: true,
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
  2021-02-04 14:14     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Slava Ovsiienko
@ 2021-02-09  6:22       ` Aman Kumar
  0 siblings, 0 replies; 13+ messages in thread
From: Aman Kumar @ 2021-02-09  6:22 UTC (permalink / raw)
  To: Slava Ovsiienko
  Cc: dev, Raslan Darawsheh, keesang.song, Asaf Penso, Shy Shyman,
	Alexander Kozyrev, Matan Azrad

Hi Slava,

Thank you for your reply.
This is currently supported (and tested) on 2nd gen AMD EPYC series
processors. We're currently trying to make it more generic across other
EPYC platforms. We too believe having these available at EAL may also help
applications and other PMDs. I'll move memory copy instructions to
lib/librte_eal/* and update this patchset.
Thanks.

*Regards*
Aman Kumar

On Thu, Feb 4, 2021 at 7:45 PM Slava Ovsiienko <viacheslavo@nvidia.com>
wrote:

> Hi,
>
> I'm sorry for asking the questions very late.
> Is this patch set still relevant, and should it be updated and considered?
>
> As I understand it, this one optimizes the memory writes in some way using
> the instructions with the hints.
> Is this specific to some CPU families, or is it more common? I suppose it
> should be considered and discussed
> more widely, possibly at the EAL level. I would propose to introduce these
> special memory routines at the EAL level
> to give an advantage to all PMDs, not specifically to mlx5.
>
> With best regards,
> Slava
>
> > -----Original Message-----
> > From: Aman Kumar <aman.kumar@vvdntech.in>
> > Sent: Saturday, October 10, 2020 12:01
> > To: dev@dpdk.org
> > Cc: Raslan Darawsheh <rasland@nvidia.com>; keesang.song@amd.com;
> > Asaf Penso <asafp@nvidia.com>; Shy Shyman <shys@nvidia.com>; Slava
> > Ovsiienko <viacheslavo@nvidia.com>; Alexander Kozyrev
> > <akozyrev@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> > aman.kumar@vvdntech.in
> > Subject: [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
> >
> > add non temporal load and temporal store for mprq memcpy.
> > define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in build
> > configuration to enable this optimization.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> > ---
> >  drivers/net/mlx5/meson.build |   1 +
> >  drivers/net/mlx5/mlx5.c      |  12 ++++
> >  drivers/net/mlx5/mlx5.h      |   3 +
> >  drivers/net/mlx5/mlx5_rxq.c  |   3 +
> >  drivers/net/mlx5/mlx5_rxtx.c | 116
> > ++++++++++++++++++++++++++++++++++-
> >  drivers/net/mlx5/mlx5_rxtx.h |   3 +
> >  meson_options.txt            |   2 +
> >  7 files changed, 138 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
> > index 9a97bb9c8..38e93fdc1 100644
> > --- a/drivers/net/mlx5/meson.build
> > +++ b/drivers/net/mlx5/meson.build
> > @@ -47,6 +47,7 @@ foreach option:cflags_options
> >               cflags += option
> >       endif
> >  endforeach
> > +dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY',
> > +get_option('mlx5_ntload_tstore'))
> >  if get_option('buildtype').contains('debug')
> >       cflags += [ '-pedantic', '-DPEDANTIC' ]  else diff --git
> > a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 01ead6e6a..a2796eaa5 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -160,6 +160,11 @@
> >  /* Configure timeout of LRO session (in microseconds). */  #define
> > MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
> >
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +/* mprq_tstore_memcpy */
> > +#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
> > +#endif
> > +
> >  /*
> >   * Device parameter to configure the total data buffer size for a single
> >   * hairpin queue (logarithm value).
> > @@ -1623,6 +1628,10 @@ mlx5_args_check(const char *key, const char
> > *val, void *opaque)
> >               config->sys_mem_en = !!tmp;
> >       } else if (strcmp(MLX5_DECAP_EN, key) == 0) {
> >               config->decap_en = !!tmp;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     } else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
> > +             config->mprq_tstore_memcpy = tmp;
> > +#endif
> >       } else {
> >               DRV_LOG(WARNING, "%s: unknown parameter", key);
> >               rte_errno = EINVAL;
> > @@ -1683,6 +1692,9 @@ mlx5_args(struct mlx5_dev_config *config, struct
> > rte_devargs *devargs)
> >               MLX5_RECLAIM_MEM,
> >               MLX5_SYS_MEM_EN,
> >               MLX5_DECAP_EN,
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +             MLX5_MPRQ_TSTORE_MEMCPY,
> > +#endif
> >               NULL,
> >       };
> >       struct rte_kvargs *kvlist;
> > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 43da9a1fb..1eb305650 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -234,6 +234,9 @@ struct mlx5_dev_config {
> >       int tx_skew; /* Tx scheduling skew between WQE and data on wire.
> > */
> >       struct mlx5_hca_attr hca_attr; /* HCA attributes. */
> >       struct mlx5_lro_config lro; /* LRO configuration. */
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     unsigned int mprq_tstore_memcpy:1;
> > +#endif
> >  };
> >
> >
> > diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> index
> > c059e216d..c8db59a12 100644
> > --- a/drivers/net/mlx5/mlx5_rxq.c
> > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > @@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> > idx, uint16_t desc,
> >       tmpl->socket = socket;
> >       if (dev->data->dev_conf.intr_conf.rxq)
> >               tmpl->irq = 1;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
> > #endif
> >       mprq_stride_nums = config->mprq.stride_num_n ?
> >               config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
> >       mprq_stride_size = non_scatter_min_mbuf_size <= diff --git
> > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> > 0b87be15b..f59e30d82 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > @@ -123,6 +123,97 @@ uint8_t mlx5_swp_types_table[1 << 10]
> > __rte_cache_aligned;  uint64_t rte_net_mlx5_dynf_inline_mask;  #define
> > PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
> >
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +static void copy16B_ts(void *dst, void *src) {
> > +     __m128i var128;
> > +
> > +     var128 = _mm_stream_load_si128((__m128i *)src);
> > +     _mm_storeu_si128((__m128i *)dst, var128); }
> > +
> > +static void copy32B_ts(void *dst, void *src) {
> > +     __m256i ymm0;
> > +
> > +     ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +     _mm256_storeu_si256((__m256i *)dst, ymm0); }
> > +
> > +static void copy64B_ts(void *dst, void *src) {
> > +     __m256i ymm0, ymm1;
> > +
> > +     ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +     ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 32));
> > +     _mm256_storeu_si256((__m256i *)dst, ymm0);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1); }
> > +
> > +static void copy128B_ts(void *dst, void *src) {
> > +     __m256i ymm0, ymm1, ymm2, ymm3;
> > +
> > +     ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +     ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 32));
> > +     ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 64));
> > +     ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 96));
> > +     _mm256_storeu_si256((__m256i *)dst, ymm0);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3); }
> > +
> > +static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int
> > +len) {
> > +     void *dest = dst;
> > +
> > +     while (len >= 128) {
> > +             copy128B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 128;
> > +             src = (uint8_t *)src + 128;
> > +             len -= 128;
> > +     }
> > +     while (len >= 64) {
> > +             copy64B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 64;
> > +             src = (uint8_t *)src + 64;
> > +             len -= 64;
> > +     }
> > +     while (len >= 32) {
> > +             copy32B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 32;
> > +             src = (uint8_t *)src + 32;
> > +             len -= 32;
> > +     }
> > +     if (len >= 16) {
> > +             copy16B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 16;
> > +             src = (uint8_t *)src + 16;
> > +             len -= 16;
> > +     }
> > +     if (len >= 8) {
> > +             *(uint64_t *)dst = *(const uint64_t *)src;
> > +             dst = (uint8_t *)dst + 8;
> > +             src = (uint8_t *)src + 8;
> > +             len -= 8;
> > +     }
> > +     if (len >= 4) {
> > +             *(uint32_t *)dst = *(const uint32_t *)src;
> > +             dst = (uint8_t *)dst + 4;
> > +             src = (uint8_t *)src + 4;
> > +             len -= 4;
> > +     }
> > +     if (len != 0) {
> > +             dst = (uint8_t *)dst - (4 - len);
> > +             src = (uint8_t *)src - (4 - len);
> > +             *(uint32_t *)dst = *(const uint32_t *)src;
> > +     }
> > +
> > +     return dest;
> > +}
> > +#endif
> > +
> >  /**
> >   * Build a table to translate Rx completion flags to packet type.
> >   *
> > @@ -1707,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > rte_mbuf **pkts, uint16_t pkts_n)
> >               int32_t hdrm_overlap;
> >               volatile struct mlx5_mini_cqe8 *mcqe = NULL;
> >               uint32_t rss_hash_res = 0;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +             uintptr_t data_addr;
> > +#endif
> >
> >               if (consumed_strd == strd_n) {
> >                       /* Replace WQE only if the buffer is still in use.
> */
> > @@ -1772,12 +1866,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > rte_mbuf **pkts, uint16_t pkts_n)
> >                * - Out of buffer in the Mempool for Multi-Packet RQ.
> >                * - The packet's stride overlaps a headroom and scatter is
> > off.
> >                */
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +             if (unlikely(!rxq->mprq_tstore_memcpy) &&
> > +                     len <= rxq->mprq_max_memcpy_len) {
> > +                     rte_prefetch1(addr);
> > +                     if (len > RTE_CACHE_LINE_SIZE)
> > +                             rte_prefetch2((void *)((uintptr_t)addr +
> > RTE_CACHE_LINE_SIZE));
> > +             }
> > +#endif
> >               if (len <= rxq->mprq_max_memcpy_len ||
> >                   rxq->mprq_repl == NULL ||
> >                   (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
> >                       if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
> > -                             rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
> > -                                        addr, len);
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +                             data_addr =
> > (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
> > +                             if (!(rxq->mprq_tstore_memcpy))
> > +                                     rte_memcpy((void *)data_addr,
> > addr, len);
> > +                             else if ((rxq->mprq_tstore_memcpy) &&
> > +                                        !((data_addr | (uintptr_t)addr)
> &
> > ALIGNMENT_MASK))
> > +
> >       memcpy_aligned_rx_tstore_16B((void *)data_addr,
> > +                                                     addr, len);
> > +                             else
> > +#endif
> > +                                     rte_memcpy(rte_pktmbuf_mtod(pkt,
> > void *),
> > +                                                     addr, len);
> >                               DATA_LEN(pkt) = len;
> >                       } else if (rxq->strd_scatter_en) {
> >                               struct rte_mbuf *prev = pkt;
> > diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > index 9ffa028d2..a8ea1a795 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > @@ -153,6 +153,9 @@ struct mlx5_rxq_data {
> >       uint32_t tunnel; /* Tunnel information. */
> >       uint64_t flow_meta_mask;
> >       int32_t flow_meta_offset;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     unsigned int mprq_tstore_memcpy:1;
> > +#endif
> >  } __rte_cache_aligned;
> >
> >  enum mlx5_rxq_type {
> > diff --git a/meson_options.txt b/meson_options.txt index
> > 9bf18ab6b..a4bc565d2 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
> >       description: 'maximum number of cores/threads supported by EAL')
> > option('max_numa_nodes', type: 'integer', value: 4,
> >       description: 'maximum number of NUMA nodes supported by EAL')
> > +option('mlx5_ntload_tstore', type: 'boolean', value: false,
> > +     description: 'to enable optimized MPRQ in RX datapath')
> >  option('enable_trace_fp', type: 'boolean', value: false,
> >       description: 'enable fast path trace points.')  option('tests',
> type:
> > 'boolean', value: true,
> > --
> > 2.25.1
>
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
@ 2020-10-10  8:57 Aman Kumar
  0 siblings, 0 replies; 13+ messages in thread
From: Aman Kumar @ 2020-10-10  8:57 UTC (permalink / raw)
  To: dev
  Cc: rasland, keesang.song, asafp, shys, viacheslavo, akozyrev, matan,
	aman.kumar

add non temporal load and temporal store for mprq memcpy.
define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build |   1 +
 drivers/net/mlx5/mlx5.c      |  12 ++++
 drivers/net/mlx5/mlx5.h      |   3 +
 drivers/net/mlx5/mlx5_rxq.c  |   3 +
 drivers/net/mlx5/mlx5_rxtx.c | 116 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |   3 +
 meson_options.txt            |   2 +
 7 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 9a97bb9c8..38e93fdc1 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -47,6 +47,7 @@ foreach option:cflags_options
 		cflags += option
 	endif
 endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 01ead6e6a..a2796eaa5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -160,6 +160,11 @@
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1623,6 +1628,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1683,6 +1692,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 43da9a1fb..1eb305650 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -234,6 +234,9 @@ struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index c059e216d..c8db59a12 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
 	mprq_stride_size = non_scatter_min_mbuf_size <=
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0b87be15b..f59e30d82 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -123,6 +123,97 @@ uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
 uint64_t rte_net_mlx5_dynf_inline_mask;
 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
/*
 * Copy 16 bytes from src to dst using a non-temporal (streaming) load
 * and a regular unaligned store.  src must be 16-byte aligned, as
 * required by the MOVNTDQA instruction behind _mm_stream_load_si128.
 */
static void copy16B_ts(void *dst, void *src)
{
	_mm_storeu_si128((__m128i *)dst,
			 _mm_stream_load_si128((__m128i *)src));
}
+
/*
 * Copy 32 bytes from src to dst: one 256-bit non-temporal load
 * followed by an unaligned store.  src must be 32-byte aligned
 * (VMOVNTDQA requirement).
 */
static void copy32B_ts(void *dst, void *src)
{
	const __m256i chunk = _mm256_stream_load_si256((const __m256i *)src);

	_mm256_storeu_si256((__m256i *)dst, chunk);
}
+
/*
 * Copy 64 bytes from src to dst as two 256-bit non-temporal loads
 * followed by two unaligned stores.  Both loads are issued before the
 * stores.  src must be 32-byte aligned (VMOVNTDQA requirement).
 */
static void copy64B_ts(void *dst, void *src)
{
	uint8_t *d = (uint8_t *)dst;
	uint8_t *s = (uint8_t *)src;
	const __m256i lo = _mm256_stream_load_si256((const __m256i *)s);
	const __m256i hi = _mm256_stream_load_si256((const __m256i *)(s + 32));

	_mm256_storeu_si256((__m256i *)d, lo);
	_mm256_storeu_si256((__m256i *)(d + 32), hi);
}
+
/*
 * Copy 128 bytes from src to dst as four 256-bit non-temporal loads
 * followed by four unaligned stores.  All loads complete before the
 * first store, matching the load/store ordering of the 64B variant.
 * src must be 32-byte aligned (VMOVNTDQA requirement).
 */
static void copy128B_ts(void *dst, void *src)
{
	__m256i v[4];
	int i;

	for (i = 0; i < 4; i++)
		v[i] = _mm256_stream_load_si256
			((const __m256i *)((uint8_t *)src + i * 32));
	for (i = 0; i < 4; i++)
		_mm256_storeu_si256
			((__m256i *)((uint8_t *)dst + i * 32), v[i]);
}
+
/*
 * Copy len bytes from src to dst, using non-temporal (streaming) loads
 * for the bulk of the data and scalar copies for the sub-16-byte tail.
 *
 * @dst: destination buffer (mbuf data room in the Rx path).
 * @src: source buffer (MPRQ stride).
 * @len: number of bytes to copy.
 *
 * @return the original dst pointer (memcpy-style).
 *
 * NOTE(review): the 256-bit stream loads in copy32B/64B/128B_ts require
 * src to be 32-byte aligned, while the function name suggests only 16
 * bytes.  The caller gates this path on ALIGNMENT_MASK — confirm that
 * mask guarantees 32-byte alignment when the AVX2 helpers are used.
 */
static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
{
	void *dest = dst;	/* saved for the memcpy-style return */

	/* Bulk copy in descending power-of-two chunks. */
	while (len >= 128) {
		copy128B_ts(dst, src);
		dst = (uint8_t *)dst + 128;
		src = (uint8_t *)src + 128;
		len -= 128;
	}
	while (len >= 64) {
		copy64B_ts(dst, src);
		dst = (uint8_t *)dst + 64;
		src = (uint8_t *)src + 64;
		len -= 64;
	}
	while (len >= 32) {
		copy32B_ts(dst, src);
		dst = (uint8_t *)dst + 32;
		src = (uint8_t *)src + 32;
		len -= 32;
	}
	if (len >= 16) {
		copy16B_ts(dst, src);
		dst = (uint8_t *)dst + 16;
		src = (uint8_t *)src + 16;
		len -= 16;
	}
	if (len >= 8) {
		*(uint64_t *)dst = *(const uint64_t *)src;
		dst = (uint8_t *)dst + 8;
		src = (uint8_t *)src + 8;
		len -= 8;
	}
	if (len >= 4) {
		*(uint32_t *)dst = *(const uint32_t *)src;
		dst = (uint8_t *)dst + 4;
		src = (uint8_t *)src + 4;
		len -= 4;
	}
	/*
	 * Tail of 1-3 bytes, copied byte-wise.  The previous code stepped
	 * dst/src back by (4 - len) and redid an overlapping 4-byte copy,
	 * which reads and writes before the start of both buffers whenever
	 * the total length is less than 4 — an out-of-bounds access.
	 */
	while (len > 0) {
		*(uint8_t *)dst = *(const uint8_t *)src;
		dst = (uint8_t *)dst + 1;
		src = (uint8_t *)src + 1;
		len--;
	}

	return dest;
}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -1707,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		uintptr_t data_addr;
+#endif
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
@@ -1772,12 +1866,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * - Out of buffer in the Mempool for Multi-Packet RQ.
 		 * - The packet's stride overlaps a headroom and scatter is off.
 		 */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		if (unlikely(!rxq->mprq_tstore_memcpy) &&
+			len <= rxq->mprq_max_memcpy_len) {
+			rte_prefetch1(addr);
+			if (len > RTE_CACHE_LINE_SIZE)
+				rte_prefetch2((void *)((uintptr_t)addr + RTE_CACHE_LINE_SIZE));
+		}
+#endif
 		if (len <= rxq->mprq_max_memcpy_len ||
 		    rxq->mprq_repl == NULL ||
 		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+				if (!(rxq->mprq_tstore_memcpy))
+					rte_memcpy((void *)data_addr, addr, len);
+				else if ((rxq->mprq_tstore_memcpy) &&
+					   !((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK))
+					memcpy_aligned_rx_tstore_16B((void *)data_addr,
+							addr, len);
+				else
+#endif
+					rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+							addr, len);
 				DATA_LEN(pkt) = len;
 			} else if (rxq->strd_scatter_en) {
 				struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 9ffa028d2..a8ea1a795 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -153,6 +153,9 @@ struct mlx5_rxq_data {
 	uint32_t tunnel; /* Tunnel information. */
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
diff --git a/meson_options.txt b/meson_options.txt
index 9bf18ab6b..a4bc565d2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
 	description: 'maximum number of cores/threads supported by EAL')
 option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false,
+	description: 'to enable optimized MPRQ in RX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-02-09  6:22 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-07 19:32 [dpdk-dev] [PATCH 1/3] net/mlx5: optimize mprq memcpy Aman Kumar
2020-09-07 19:32 ` [dpdk-dev] [PATCH 2/3] net/mlx5: add non temporal store for WQE fields Aman Kumar
2020-09-07 19:32 ` [dpdk-dev] [PATCH 3/3] config: added build config file for AMD EPYC platform Aman Kumar
2020-09-08  9:11   ` David Marchand
2020-09-25  3:16 ` [dpdk-dev] [PATCH v2 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
2020-09-25  3:16   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
2020-10-10  9:00   ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
2020-10-10  9:00     ` [dpdk-dev] [PATCH v3 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
2020-10-29  7:59     ` [dpdk-dev] [PATCH v4 1/2] net/mlx5: optimize mprq memcpy Aman Kumar
2020-10-29  7:59       ` [dpdk-dev] [PATCH v4 2/2] net/mlx5: add non temporal store for WQE fields Aman Kumar
2021-02-04 14:14     ` [dpdk-dev] [PATCH v3 1/2] net/mlx5: optimize mprq memcpy Slava Ovsiienko
2021-02-09  6:22       ` Aman Kumar
2020-10-10  8:57 Aman Kumar

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git