DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration
@ 2018-11-15 10:29 Yongseok Koh
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Yongseok Koh

There's some performance drop due to extra condition checks on the
datapath. The check for external memory registration should be
consolidated into the existing bottom-half.

Fixes: 31912d992403 ("net/mlx4: support externally allocated static memory")

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 drivers/net/mlx4/mlx4_mr.c   | 28 +++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h | 26 ++++++--------------------
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 726788a60d..a0094483ab 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -1042,7 +1042,7 @@ mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
  * @return
  *   Searched LKey on success, UINT32_MAX on no match.
  */
-uint32_t
+static uint32_t
 mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
 {
 	struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
@@ -1054,6 +1054,32 @@ mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
 }
 
 /**
+ * Bottom-half of LKey search on Tx. If it can't be searched in the memseg
+ * list, register the mempool of the mbuf as externally allocated memory.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param mb
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb)
+{
+	uintptr_t addr = (uintptr_t)mb->buf_addr;
+	uint32_t lkey;
+
+	lkey = mlx4_tx_addr2mr_bh(txq, addr);
+	if (lkey == UINT32_MAX && rte_errno == ENXIO) {
+		/* Mempool may have externally allocated memory. */
+		return mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb));
+	}
+	return lkey;
+}
+
+/**
  * Flush all of the local cache entries.
  *
  * @param mr_ctrl
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 1be060cda1..d7ec4e0c5f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -162,7 +162,7 @@ void mlx4_tx_queue_release(void *dpdk_txq);
 
 void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl);
 uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr);
-uint32_t mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr);
+uint32_t mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb);
 uint32_t mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr,
 			       struct rte_mempool *mp);
 
@@ -176,7 +176,7 @@ uint32_t mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr,
  * @return
  *   Memory pool where data is located for given mbuf.
  */
-static struct rte_mempool *
+static inline struct rte_mempool *
 mlx4_mb2mp(struct rte_mbuf *buf)
 {
 	if (unlikely(RTE_MBUF_INDIRECT(buf)))
@@ -225,9 +225,10 @@ mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
  *   Searched LKey on success, UINT32_MAX on no match.
  */
 static __rte_always_inline uint32_t
-mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
+mlx4_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb)
 {
 	struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+	uintptr_t addr = (uintptr_t)mb->buf_addr;
 	uint32_t lkey;
 
 	/* Check generation bit to see if there's any change on existing MRs. */
@@ -238,23 +239,8 @@ mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
 				    MLX4_MR_CACHE_N, addr);
 	if (likely(lkey != UINT32_MAX))
 		return lkey;
-	/* Take slower bottom-half (binary search) on miss. */
-	return mlx4_tx_addr2mr_bh(txq, addr);
-}
-
-static __rte_always_inline uint32_t
-mlx4_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb)
-{
-	uintptr_t addr = (uintptr_t)mb->buf_addr;
-	uint32_t lkey = mlx4_tx_addr2mr(txq, addr);
-
-	if (likely(lkey != UINT32_MAX))
-		return lkey;
-	if (rte_errno == ENXIO) {
-		/* Mempool may have externally allocated memory. */
-		lkey = mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb));
-	}
-	return lkey;
+	/* Take slower bottom-half on miss. */
+	return mlx4_tx_mb2mr_bh(txq, mb);
 }
 
 #endif /* MLX4_RXTX_H_ */
-- 
2.11.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH 2/4] net/mlx5: optimize Tx external memory registration
  2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
@ 2018-11-15 10:29 ` Yongseok Koh
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write Yongseok Koh
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Yongseok Koh

There's some performance drop due to extra condition checks on the
datapath. The check for external memory registration should be
consolidated into the existing bottom-half.

Fixes: 7e43a32ee060 ("net/mlx5: support externally allocated static memory")

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 drivers/net/mlx5/mlx5_mr.c   | 28 +++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h | 26 ++++++--------------------
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 6d7653d7d6..442b2d2321 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -1045,7 +1045,7 @@ mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr)
  * @return
  *   Searched LKey on success, UINT32_MAX on no match.
  */
-uint32_t
+static uint32_t
 mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
 {
 	struct mlx5_txq_ctrl *txq_ctrl =
@@ -1060,6 +1060,32 @@ mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
 }
 
 /**
+ * Bottom-half of LKey search on Tx. If it can't be searched in the memseg
+ * list, register the mempool of the mbuf as externally allocated memory.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param mb
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
+{
+	uintptr_t addr = (uintptr_t)mb->buf_addr;
+	uint32_t lkey;
+
+	lkey = mlx5_tx_addr2mr_bh(txq, addr);
+	if (lkey == UINT32_MAX && rte_errno == ENXIO) {
+		/* Mempool may have externally allocated memory. */
+		return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
+	}
+	return lkey;
+}
+
+/**
  * Flush all of the local cache entries.
  *
  * @param mr_ctrl
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1b6200f6c8..59fb43fefe 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -363,7 +363,7 @@ uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 
 void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl);
 uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr);
-uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr);
+uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb);
 uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
 			       struct rte_mempool *mp);
 
@@ -619,7 +619,7 @@ mlx5_tx_complete(struct mlx5_txq_data *txq)
  * @return
  *   Memory pool where data is located for given mbuf.
  */
-static struct rte_mempool *
+static inline struct rte_mempool *
 mlx5_mb2mp(struct rte_mbuf *buf)
 {
 	if (unlikely(RTE_MBUF_INDIRECT(buf)))
@@ -668,9 +668,10 @@ mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr)
  *   Searched LKey on success, UINT32_MAX on no match.
  */
 static __rte_always_inline uint32_t
-mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr)
+mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
 {
 	struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+	uintptr_t addr = (uintptr_t)mb->buf_addr;
 	uint32_t lkey;
 
 	/* Check generation bit to see if there's any change on existing MRs. */
@@ -681,23 +682,8 @@ mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr)
 				    MLX5_MR_CACHE_N, addr);
 	if (likely(lkey != UINT32_MAX))
 		return lkey;
-	/* Take slower bottom-half (binary search) on miss. */
-	return mlx5_tx_addr2mr_bh(txq, addr);
-}
-
-static __rte_always_inline uint32_t
-mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
-{
-	uintptr_t addr = (uintptr_t)mb->buf_addr;
-	uint32_t lkey = mlx5_tx_addr2mr(txq, addr);
-
-	if (likely(lkey != UINT32_MAX))
-		return lkey;
-	if (rte_errno == ENXIO) {
-		/* Mempool may have externally allocated memory. */
-		lkey = mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
-	}
-	return lkey;
+	/* Take slower bottom-half on miss. */
+	return mlx5_tx_mb2mr_bh(txq, mb);
 }
 
 /**
-- 
2.11.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write
  2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
@ 2018-11-15 10:29 ` Yongseok Koh
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold Yongseok Koh
  2018-11-15 13:13 ` [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Shahaf Shuler
  3 siblings, 0 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Yongseok Koh, stable

The unnecessary volatile qualifier keeps the compiler from further
optimizing the code, which results in a small performance drop (~2%).
Because memory barriers are already in place, it is safe to remove.

Fixes: 6bf10ab69be0 ("net/mlx5: support 32-bit systems")
Cc: stable@dpdk.org

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 59fb43fefe..e210453fe0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -379,17 +379,16 @@ uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
  *   Address of the lock to use for that UAR access.
  */
 static __rte_always_inline void
-__mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr,
+__mlx5_uar_write64_relaxed(uint64_t val, void *addr,
 			   rte_spinlock_t *lock __rte_unused)
 {
 #ifdef RTE_ARCH_64
-	rte_write64_relaxed(val, addr);
+	*(uint64_t *)addr = val;
 #else /* !RTE_ARCH_64 */
 	rte_spinlock_lock(lock);
-	rte_write32_relaxed(val, addr);
+	*(uint32_t *)addr = val;
 	rte_io_wmb();
-	rte_write32_relaxed(val >> 32,
-			    (volatile void *)((volatile char *)addr + 4));
+	*((uint32_t *)addr + 1) = val >> 32;
 	rte_spinlock_unlock(lock);
 #endif
 }
@@ -407,7 +406,7 @@ __mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr,
  *   Address of the lock to use for that UAR access.
  */
 static __rte_always_inline void
-__mlx5_uar_write64(uint64_t val, volatile void *addr, rte_spinlock_t *lock)
+__mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock)
 {
 	rte_io_wmb();
 	__mlx5_uar_write64_relaxed(val, addr, lock);
-- 
2.11.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold
  2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write Yongseok Koh
@ 2018-11-15 10:29 ` Yongseok Koh
  2018-11-15 13:13 ` [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Shahaf Shuler
  3 siblings, 0 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
  To: Shahaf Shuler; +Cc: dev, Yongseok Koh, stable

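The Rx buffer replenishment threshold is redundantly recalculated on
every burst, which causes a small performance drop. Calculate it once
when the Rx queue is created and store it in the queue structure.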

Fixes: e10245a13b2e ("net/mlx5: fix Rx buffer replenishment threshold")
Cc: stable@dpdk.org

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxq.c           | 2 ++
 drivers/net/mlx5/mlx5_rxtx.h          | 1 +
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 2 +-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  | 2 +-
 4 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b27fc4798d..183da0e282 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1468,6 +1468,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->rxq.mp = mp;
 	tmpl->rxq.stats.idx = idx;
 	tmpl->rxq.elts_n = log2above(desc);
+	tmpl->rxq.rq_repl_thresh =
+		MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
 	tmpl->rxq.elts =
 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
 #ifndef RTE_ARCH_64
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index e210453fe0..f47d327cfb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -101,6 +101,7 @@ struct mlx5_rxq_data {
 	uint16_t consumed_strd; /* Number of consumed strides in WQE. */
 	uint32_t rq_pi;
 	uint32_t cq_ci;
+	uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
 	struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
 	uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
 	volatile void *wqes;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 0b729f1859..883fe1bf91 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -732,7 +732,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
 	 */
 	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n))
+	if (repl_n >= rxq->rq_repl_thresh)
 		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
 	/* See if there're unreturned mbufs from compressed CQE. */
 	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index e0f95f923d..14117c4bb4 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -716,7 +716,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
 	 */
 	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n))
+	if (repl_n >= rxq->rq_repl_thresh)
 		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
 	/* See if there're unreturned mbufs from compressed CQE. */
 	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration
  2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
                   ` (2 preceding siblings ...)
  2018-11-15 10:29 ` [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold Yongseok Koh
@ 2018-11-15 13:13 ` Shahaf Shuler
  3 siblings, 0 replies; 5+ messages in thread
From: Shahaf Shuler @ 2018-11-15 13:13 UTC (permalink / raw)
  To: Yongseok Koh; +Cc: dev

Thursday, November 15, 2018 12:29 PM, Yongseok Koh:
> Subject: [PATCH 1/4] net/mlx4: optimize Tx external memory registration
> 
> There's some performance drop due to extra condition checks on the
> datapath. Checking for external memory registration should be consolidated
> to the existing bottom-half.
> 
> Fixes: 31912d992403 ("net/mlx4: support externally allocated static
> memory")
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>

Series applied to next-net-mlx, thanks. 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-11-15 13:13 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold Yongseok Koh
2018-11-15 13:13 ` [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Shahaf Shuler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).