* [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration
@ 2018-11-15 10:29 Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
To: Shahaf Shuler; +Cc: dev, Yongseok Koh
There's some performance drop due to extra condition checks on the
datapath. Checking for external memory registration should be consolidated
to the existing bottom-half.
Fixes: 31912d992403 ("net/mlx4: support externally allocated static memory")
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
drivers/net/mlx4/mlx4_mr.c | 28 +++++++++++++++++++++++++++-
drivers/net/mlx4/mlx4_rxtx.h | 26 ++++++--------------------
2 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 726788a60d..a0094483ab 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -1042,7 +1042,7 @@ mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
* @return
* Searched LKey on success, UINT32_MAX on no match.
*/
-uint32_t
+static uint32_t
mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
{
struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
@@ -1054,6 +1054,32 @@ mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
}
/**
+ * Bottom-half of LKey search on Tx. If it can't be searched in the memseg
+ * list, register the mempool of the mbuf as externally allocated memory.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param mb
+ * Pointer to mbuf.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb)
+{
+ uintptr_t addr = (uintptr_t)mb->buf_addr;
+ uint32_t lkey;
+
+ lkey = mlx4_tx_addr2mr_bh(txq, addr);
+ if (lkey == UINT32_MAX && rte_errno == ENXIO) {
+ /* Mempool may have externally allocated memory. */
+ return mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb));
+ }
+ return lkey;
+}
+
+/**
* Flush all of the local cache entries.
*
* @param mr_ctrl
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 1be060cda1..d7ec4e0c5f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -162,7 +162,7 @@ void mlx4_tx_queue_release(void *dpdk_txq);
void mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl);
uint32_t mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr);
-uint32_t mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr);
+uint32_t mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb);
uint32_t mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr,
struct rte_mempool *mp);
@@ -176,7 +176,7 @@ uint32_t mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr,
* @return
* Memory pool where data is located for given mbuf.
*/
-static struct rte_mempool *
+static inline struct rte_mempool *
mlx4_mb2mp(struct rte_mbuf *buf)
{
if (unlikely(RTE_MBUF_INDIRECT(buf)))
@@ -225,9 +225,10 @@ mlx4_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
* Searched LKey on success, UINT32_MAX on no match.
*/
static __rte_always_inline uint32_t
-mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
+mlx4_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb)
{
struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ uintptr_t addr = (uintptr_t)mb->buf_addr;
uint32_t lkey;
/* Check generation bit to see if there's any change on existing MRs. */
@@ -238,23 +239,8 @@ mlx4_tx_addr2mr(struct txq *txq, uintptr_t addr)
MLX4_MR_CACHE_N, addr);
if (likely(lkey != UINT32_MAX))
return lkey;
- /* Take slower bottom-half (binary search) on miss. */
- return mlx4_tx_addr2mr_bh(txq, addr);
-}
-
-static __rte_always_inline uint32_t
-mlx4_tx_mb2mr(struct txq *txq, struct rte_mbuf *mb)
-{
- uintptr_t addr = (uintptr_t)mb->buf_addr;
- uint32_t lkey = mlx4_tx_addr2mr(txq, addr);
-
- if (likely(lkey != UINT32_MAX))
- return lkey;
- if (rte_errno == ENXIO) {
- /* Mempool may have externally allocated memory. */
- lkey = mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb));
- }
- return lkey;
+ /* Take slower bottom-half on miss. */
+ return mlx4_tx_mb2mr_bh(txq, mb);
}
#endif /* MLX4_RXTX_H_ */
--
2.11.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* [dpdk-dev] [PATCH 2/4] net/mlx5: optimize Tx external memory registration
2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
@ 2018-11-15 10:29 ` Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write Yongseok Koh
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
To: Shahaf Shuler; +Cc: dev, Yongseok Koh
There's some performance drop due to extra condition checks on the
datapath. Checking for external memory registration should be consolidated
to the existing bottom-half.
Fixes: 7e43a32ee060 ("net/mlx5: support externally allocated static memory")
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
drivers/net/mlx5/mlx5_mr.c | 28 +++++++++++++++++++++++++++-
drivers/net/mlx5/mlx5_rxtx.h | 26 ++++++--------------------
2 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 6d7653d7d6..442b2d2321 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -1045,7 +1045,7 @@ mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr)
* @return
* Searched LKey on success, UINT32_MAX on no match.
*/
-uint32_t
+static uint32_t
mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
{
struct mlx5_txq_ctrl *txq_ctrl =
@@ -1060,6 +1060,32 @@ mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
}
/**
+ * Bottom-half of LKey search on Tx. If it can't be searched in the memseg
+ * list, register the mempool of the mbuf as externally allocated memory.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param mb
+ * Pointer to mbuf.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on no match.
+ */
+uint32_t
+mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
+{
+ uintptr_t addr = (uintptr_t)mb->buf_addr;
+ uint32_t lkey;
+
+ lkey = mlx5_tx_addr2mr_bh(txq, addr);
+ if (lkey == UINT32_MAX && rte_errno == ENXIO) {
+ /* Mempool may have externally allocated memory. */
+ return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
+ }
+ return lkey;
+}
+
+/**
* Flush all of the local cache entries.
*
* @param mr_ctrl
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1b6200f6c8..59fb43fefe 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -363,7 +363,7 @@ uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl);
uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr);
-uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr);
+uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb);
uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
struct rte_mempool *mp);
@@ -619,7 +619,7 @@ mlx5_tx_complete(struct mlx5_txq_data *txq)
* @return
* Memory pool where data is located for given mbuf.
*/
-static struct rte_mempool *
+static inline struct rte_mempool *
mlx5_mb2mp(struct rte_mbuf *buf)
{
if (unlikely(RTE_MBUF_INDIRECT(buf)))
@@ -668,9 +668,10 @@ mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr)
* Searched LKey on success, UINT32_MAX on no match.
*/
static __rte_always_inline uint32_t
-mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr)
+mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
{
struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
+ uintptr_t addr = (uintptr_t)mb->buf_addr;
uint32_t lkey;
/* Check generation bit to see if there's any change on existing MRs. */
@@ -681,23 +682,8 @@ mlx5_tx_addr2mr(struct mlx5_txq_data *txq, uintptr_t addr)
MLX5_MR_CACHE_N, addr);
if (likely(lkey != UINT32_MAX))
return lkey;
- /* Take slower bottom-half (binary search) on miss. */
- return mlx5_tx_addr2mr_bh(txq, addr);
-}
-
-static __rte_always_inline uint32_t
-mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
-{
- uintptr_t addr = (uintptr_t)mb->buf_addr;
- uint32_t lkey = mlx5_tx_addr2mr(txq, addr);
-
- if (likely(lkey != UINT32_MAX))
- return lkey;
- if (rte_errno == ENXIO) {
- /* Mempool may have externally allocated memory. */
- lkey = mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
- }
- return lkey;
+ /* Take slower bottom-half on miss. */
+ return mlx5_tx_mb2mr_bh(txq, mb);
}
/**
--
2.11.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write
2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
@ 2018-11-15 10:29 ` Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold Yongseok Koh
2018-11-15 13:13 ` [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Shahaf Shuler
3 siblings, 0 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
To: Shahaf Shuler; +Cc: dev, Yongseok Koh, stable
Unnecessary volatile attribute keeps compiler from further optimizing the
code and this results in a little performance drop (~2%). Because of memory
barriers, it is safe to remove.
Fixes: 6bf10ab69be0 ("net/mlx5: support 32-bit systems")
Cc: stable@dpdk.org
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
drivers/net/mlx5/mlx5_rxtx.h | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 59fb43fefe..e210453fe0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -379,17 +379,16 @@ uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
* Address of the lock to use for that UAR access.
*/
static __rte_always_inline void
-__mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr,
+__mlx5_uar_write64_relaxed(uint64_t val, void *addr,
rte_spinlock_t *lock __rte_unused)
{
#ifdef RTE_ARCH_64
- rte_write64_relaxed(val, addr);
+ *(uint64_t *)addr = val;
#else /* !RTE_ARCH_64 */
rte_spinlock_lock(lock);
- rte_write32_relaxed(val, addr);
+ *(uint32_t *)addr = val;
rte_io_wmb();
- rte_write32_relaxed(val >> 32,
- (volatile void *)((volatile char *)addr + 4));
+ *((uint32_t *)addr + 1) = val >> 32;
rte_spinlock_unlock(lock);
#endif
}
@@ -407,7 +406,7 @@ __mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr,
* Address of the lock to use for that UAR access.
*/
static __rte_always_inline void
-__mlx5_uar_write64(uint64_t val, volatile void *addr, rte_spinlock_t *lock)
+__mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock)
{
rte_io_wmb();
__mlx5_uar_write64_relaxed(val, addr, lock);
--
2.11.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold
2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write Yongseok Koh
@ 2018-11-15 10:29 ` Yongseok Koh
2018-11-15 13:13 ` [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Shahaf Shuler
3 siblings, 0 replies; 5+ messages in thread
From: Yongseok Koh @ 2018-11-15 10:29 UTC (permalink / raw)
To: Shahaf Shuler; +Cc: dev, Yongseok Koh, stable
Due to redundant calculation per every burst, performance drops a little.
Fixes: e10245a13b2e ("net/mlx5: fix Rx buffer replenishment threshold")
Cc: stable@dpdk.org
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
drivers/net/mlx5/mlx5_rxq.c | 2 ++
drivers/net/mlx5/mlx5_rxtx.h | 1 +
drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 2 +-
drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 2 +-
4 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b27fc4798d..183da0e282 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1468,6 +1468,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->rxq.mp = mp;
tmpl->rxq.stats.idx = idx;
tmpl->rxq.elts_n = log2above(desc);
+ tmpl->rxq.rq_repl_thresh =
+ MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
tmpl->rxq.elts =
(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
#ifndef RTE_ARCH_64
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index e210453fe0..f47d327cfb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -101,6 +101,7 @@ struct mlx5_rxq_data {
uint16_t consumed_strd; /* Number of consumed strides in WQE. */
uint32_t rq_pi;
uint32_t cq_ci;
+ uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
volatile void *wqes;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 0b729f1859..883fe1bf91 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -732,7 +732,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
* N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
*/
repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
- if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n))
+ if (repl_n >= rxq->rq_repl_thresh)
mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
/* See if there're unreturned mbufs from compressed CQE. */
rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index e0f95f923d..14117c4bb4 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -716,7 +716,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
* N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
*/
repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
- if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n))
+ if (repl_n >= rxq->rq_repl_thresh)
mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
/* See if there're unreturned mbufs from compressed CQE. */
rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
--
2.11.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration
2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
` (2 preceding siblings ...)
2018-11-15 10:29 ` [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold Yongseok Koh
@ 2018-11-15 13:13 ` Shahaf Shuler
3 siblings, 0 replies; 5+ messages in thread
From: Shahaf Shuler @ 2018-11-15 13:13 UTC (permalink / raw)
To: Yongseok Koh; +Cc: dev
Thursday, November 15, 2018 12:29 PM, Yongseok Koh:
> Subject: [PATCH 1/4] net/mlx4: optimize Tx external memory registration
>
> There's some performance drop due to extra condition checks on the
> datapath. Checking for external memory registration should be consolidated
> to the existing bottom-half.
>
> Fixes: 31912d992403 ("net/mlx4: support externally allocated static
> memory")
>
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Series applied to next-net-mlx, thanks.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2018-11-15 13:13 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-15 10:29 [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 2/4] net/mlx5: " Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 3/4] net/mlx5: optimize Tx doorbell write Yongseok Koh
2018-11-15 10:29 ` [dpdk-dev] [PATCH 4/4] net/mlx5: optimize Rx buffer replenishment threshold Yongseok Koh
2018-11-15 13:13 ` [dpdk-dev] [PATCH 1/4] net/mlx4: optimize Tx external memory registration Shahaf Shuler
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).