DPDK patches and discussions
* [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5
@ 2016-01-29 10:32 Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 1/6] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
                   ` (6 more replies)
  0 siblings, 7 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

This patchset improves mlx5 PMD performance through better prefetching,
reordered internal structure fields and the removal of a few unnecessary
operations.

Note: should be applied after "Add flow director and RX VLAN stripping
support" to avoid conflicts.

Nelio Laranjeiro (6):
  mlx5: prefetch next TX mbuf header and data
  mlx5: reorder TX/RX queue structure
  mlx5: remove one indirection level from RX/TX functions
  mlx5: process offload flags only when requested
  mlx5: avoid lkey retrieval for inlined packets
  mlx5: free buffers immediately after completion

 drivers/net/mlx5/Makefile    |   1 +
 drivers/net/mlx5/mlx5_rxq.c  |  12 ++++
 drivers/net/mlx5/mlx5_rxtx.c | 136 +++++++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_rxtx.h |  54 ++++++++++-------
 drivers/net/mlx5/mlx5_txq.c  |  14 +++++
 5 files changed, 132 insertions(+), 85 deletions(-)

-- 
2.1.4

* [dpdk-dev] [PATCH 1/6] mlx5: prefetch next TX mbuf header and data
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
@ 2016-01-29 10:32 ` Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 2/6] mlx5: reorder TX/RX queue structure Adrien Mazarguil
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Prefetching the next mbuf header and packet data while the current one
is being processed hides memory latency and improves performance
noticeably.
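
As a minimal standalone sketch, the pattern looks as follows (the
function name and the simplified loop body are illustrative, not the
PMD's actual code):

#include <rte_mbuf.h>
#include <rte_prefetch.h>

static uint16_t
tx_burst_sketch(struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t i;

	if (n == 0)
		return 0;
	/* Warm up the first mbuf header before entering the loop. */
	rte_prefetch0(pkts[0]);
	for (i = 0; i != n; ++i) {
		if (i + 1 < n) {
			struct rte_mbuf *next = pkts[i + 1];

			/* Prefetch the next mbuf header and its data
			 * so both are in cache by the next iteration. */
			rte_prefetch0(next);
			rte_prefetch0(rte_pktmbuf_mtod(next, void *));
		}
		/* ... build descriptor and post pkts[i] here ... */
	}
	return i;
}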

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7585570..bee5ce2 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -443,8 +443,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i;
 	unsigned int max;
 	int err;
+	struct rte_mbuf *buf = pkts[0];
 
 	assert(elts_comp_cd != 0);
+	/* Prefetch first packet cacheline. */
+	rte_prefetch0(buf);
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
@@ -458,7 +461,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
+		struct rte_mbuf *buf_next = pkts[i + 1];
 		unsigned int elts_head_next =
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
@@ -481,6 +484,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				tmp = next;
 			} while (tmp != NULL);
 		}
+		if (i + 1 < max)
+			rte_prefetch0(buf_next);
 		/* Request TX completion. */
 		if (unlikely(--elts_comp_cd == 0)) {
 			elts_comp_cd = txq->elts_comp_cd_init;
@@ -502,6 +507,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			uintptr_t addr;
 			uint32_t length;
 			uint32_t lkey;
+			uintptr_t buf_next_addr;
 
 			/* Retrieve buffer information. */
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
@@ -522,6 +528,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				rte_prefetch0((volatile void *)
 					      (uintptr_t)addr);
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+			/* Prefetch next buffer data. */
+			if (i + 1 < max) {
+				buf_next_addr =
+					rte_pktmbuf_mtod(buf_next, uintptr_t);
+				rte_prefetch0((volatile void *)
+					      (uintptr_t)buf_next_addr);
+			}
 			/* Put packet into send queue. */
 #if MLX5_PMD_MAX_INLINE > 0
 			if (length <= txq->max_inline)
@@ -571,6 +584,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #endif /* MLX5_PMD_SGE_WR_N > 1 */
 		}
 		elts_head = elts_head_next;
+		buf = buf_next;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += sent_size;
-- 
2.1.4

* [dpdk-dev] [PATCH 2/6] mlx5: reorder TX/RX queue structure
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 1/6] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
@ 2016-01-29 10:32 ` Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 3/6] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Remove padding and move fields used by the data path to the beginning
of the TX/RX queue structures for better cache locality; fields needed
only during setup and cleanup are moved to the end.
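
The idea, sketched on a simplified structure (the field set is
illustrative; see the diff below for the real layout):

struct queue_sketch {
	/* Hot fields, touched on every burst, share the first
	 * cache line(s). */
	unsigned int elts_n;    /* Ring size. */
	unsigned int elts_head; /* Producer index. */
	unsigned int elts_tail; /* Consumer index. */
	unsigned int sp:1;      /* Bit-fields are packed together */
	unsigned int csum:1;    /* to avoid padding between them. */
	/* Cold fields, used only at setup/teardown, go last. */
	unsigned int socket;    /* CPU socket ID for allocations. */
	void *rd;               /* Resource domain handle. */
};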

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.h | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index fde0ca2..4a857d8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -105,7 +105,6 @@ struct priv;
 struct rxq {
 	struct priv *priv; /* Back pointer to private data. */
 	struct rte_mempool *mp; /* Memory Pool for allocations. */
-	struct ibv_mr *mr; /* Memory Region (for mp). */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
 	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
@@ -117,19 +116,20 @@ struct rxq {
 	unsigned int port_id; /* Port ID for incoming packets. */
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
-	union {
-		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
-		struct rxq_elt (*no_sp)[]; /* RX elements. */
-	} elts;
 	unsigned int sp:1; /* Use scattered RX elements. */
 	unsigned int csum:1; /* Enable checksum offloading. */
 	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
+	union {
+		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
+		struct rxq_elt (*no_sp)[]; /* RX elements. */
+	} elts;
 	uint32_t mb_len; /* Length of a mp-issued mbuf. */
-	struct mlx5_rxq_stats stats; /* RX queue counters. */
 	unsigned int socket; /* CPU socket ID for allocations. */
+	struct mlx5_rxq_stats stats; /* RX queue counters. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
+	struct ibv_mr *mr; /* Memory Region (for mp). */
 };
 
 /* Hash RX queue types. */
@@ -248,30 +248,31 @@ typedef uint8_t linear_t[16384];
 /* TX queue descriptor. */
 struct txq {
 	struct priv *priv; /* Back pointer to private data. */
-	struct {
-		const struct rte_mempool *mp; /* Cached Memory Pool. */
-		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* mr->lkey */
-	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
-	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+	struct txq_elt (*elts)[]; /* TX elements. */
 #if MLX5_PMD_MAX_INLINE > 0
 	uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
 #endif
 	unsigned int elts_n; /* (*elts)[] length. */
-	struct txq_elt (*elts)[]; /* TX elements. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
 	unsigned int elts_tail; /* First element awaiting completion. */
 	unsigned int elts_comp; /* Number of completion requests. */
 	unsigned int elts_comp_cd; /* Countdown for next completion request. */
 	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+	struct {
+		const struct rte_mempool *mp; /* Cached Memory Pool. */
+		struct ibv_mr *mr; /* Memory Region (for mp). */
+		uint32_t lkey; /* mr->lkey */
+	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+	/* Elements used only for init part are here. */
 	linear_t (*elts_linear)[]; /* Linearized buffers. */
 	struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
-	unsigned int socket; /* CPU socket ID for allocations. */
+	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
+	unsigned int socket; /* CPU socket ID for allocations. */
 };
 
 /* mlx5_rxq.c */
-- 
2.1.4

* [dpdk-dev] [PATCH 3/6] mlx5: remove one indirection level from RX/TX functions
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 1/6] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 2/6] mlx5: reorder TX/RX queue structure Adrien Mazarguil
@ 2016-01-29 10:32 ` Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 4/6] mlx5: process offload flags only when requested Adrien Mazarguil
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Avoid dereferencing pointers twice to get to fast Verbs functions by storing
them directly in RX/TX queue structures.
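
With simplified types, the setup-time caching looks like this (only
send_pending is shown; the patch does the same for the other
callbacks):

#include <stdint.h>

struct if_qp_sketch { /* stand-in for ibv_exp_qp_burst_family */
	int (*send_pending)(void *qp, uintptr_t addr, uint32_t length,
			    uint32_t lkey, uint32_t flags);
};

struct txq_sketch {
	/* Cached callback: a single load on the data path. */
	int (*send_pending)(void *qp, uintptr_t addr, uint32_t length,
			    uint32_t lkey, uint32_t flags);
	struct if_qp_sketch *if_qp;
	void *qp;
};

static void
txq_setup_sketch(struct txq_sketch *txq)
{
	/* Pay the double dereference once at setup time... */
	txq->send_pending = txq->if_qp->send_pending;
}

/* ...so the hot path calls txq->send_pending(txq->qp, ...) instead
 * of txq->if_qp->send_pending(txq->qp, ...). */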

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/Makefile    |  1 +
 drivers/net/mlx5/mlx5_rxq.c  | 12 ++++++++++++
 drivers/net/mlx5/mlx5_rxtx.c | 34 +++++++++-------------------------
 drivers/net/mlx5/mlx5_rxtx.h | 23 +++++++++++++++++------
 drivers/net/mlx5/mlx5_txq.c  | 14 ++++++++++++++
 5 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 46a17e0..39cdf2c 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -67,6 +67,7 @@ CFLAGS += -g
 CFLAGS += -I.
 CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -Wno-strict-prototypes
 LDLIBS += -libverbs
 
 # A few warnings cannot be avoided in external headers.
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index c79ce5c..537737f 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -901,6 +901,8 @@ rxq_cleanup(struct rxq *rxq)
 		rxq_free_elts_sp(rxq);
 	else
 		rxq_free_elts(rxq);
+	rxq->poll = NULL;
+	rxq->recv = NULL;
 	if (rxq->if_wq != NULL) {
 		assert(rxq->priv != NULL);
 		assert(rxq->priv->ctx != NULL);
@@ -1343,6 +1345,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	*rxq = tmpl;
 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
 	assert(ret == 0);
+	/* Assign function in queue. */
+#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
+	rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
+#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	rxq->poll = rxq->if_cq->poll_length_flags;
+#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	if (rxq->sp)
+		rxq->recv = rxq->if_wq->recv_sg_list;
+	else
+		rxq->recv = rxq->if_wq->recv_burst;
 	return 0;
 error:
 	rxq_cleanup(&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index bee5ce2..63ddc53 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -93,7 +93,7 @@ txq_complete(struct txq *txq)
 	DEBUG("%p: processing %u work requests completions",
 	      (void *)txq, elts_comp);
 #endif
-	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
+	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
 	if (unlikely(wcs_n == 0))
 		return 0;
 	if (unlikely(wcs_n < 0)) {
@@ -538,14 +538,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			/* Put packet into send queue. */
 #if MLX5_PMD_MAX_INLINE > 0
 			if (length <= txq->max_inline)
-				err = txq->if_qp->send_pending_inline
+				err = txq->send_pending_inline
 					(txq->qp,
 					 (void *)addr,
 					 length,
 					 send_flags);
 			else
 #endif
-				err = txq->if_qp->send_pending
+				err = txq->send_pending
 					(txq->qp,
 					 addr,
 					 length,
@@ -567,7 +567,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				goto stop;
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Put SG list into send queue. */
-			err = txq->if_qp->send_pending_sg_list
+			err = txq->send_pending_sg_list
 				(txq->qp,
 				 sges,
 				 ret.num,
@@ -599,7 +599,7 @@ stop:
 	txq->stats.opackets += i;
 #endif
 	/* Ring QP doorbell. */
-	err = txq->if_qp->send_flush(txq->qp);
+	err = txq->send_flush(txq->qp);
 	if (unlikely(err)) {
 		/* A nonzero value is not supposed to be returned.
 		 * Nothing can be done about it. */
@@ -733,14 +733,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Sanity checks. */
 		assert(elts_head < rxq->elts_n);
 		assert(rxq->elts_head < rxq->elts_n);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL,
-							  &flags, &vlan_tci);
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		(void)vlan_tci;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
 			int wcs_n;
@@ -877,9 +870,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		rxq->stats.ibytes += pkt_buf_len;
 #endif
 repost:
-		ret = rxq->if_wq->recv_sg_list(rxq->wq,
-					       elt->sges,
-					       RTE_DIM(elt->sges));
+		ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
 		if (unlikely(ret)) {
 			/* Inability to repost WRs is fatal. */
 			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
@@ -950,14 +941,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 */
 		rte_prefetch0(seg);
 		rte_prefetch0(&seg->cacheline1);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL,
-							  &flags, &vlan_tci);
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		(void)vlan_tci;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
 			int wcs_n;
@@ -1049,7 +1033,7 @@ repost:
 #ifdef DEBUG_RECV
 	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
 #endif
-	ret = rxq->if_wq->recv_burst(rxq->wq, sges, i);
+	ret = rxq->recv(rxq->wq, sges, i);
 	if (unlikely(ret)) {
 		/* Inability to repost WRs is fatal. */
 		DEBUG("%p: recv_burst(): failed (ret=%d)",
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 4a857d8..b239ebf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -107,12 +107,8 @@ struct rxq {
 	struct rte_mempool *mp; /* Memory Pool for allocations. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
-	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-	struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	int32_t (*poll)(); /* Verbs poll function. */
+	int32_t (*recv)(); /* Verbs receive function. */
 	unsigned int port_id; /* Port ID for incoming packets. */
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
@@ -130,6 +126,12 @@ struct rxq {
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
 	struct ibv_mr *mr; /* Memory Region (for mp). */
+	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
+#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
+	struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
+#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
 };
 
 /* Hash RX queue types. */
@@ -248,6 +250,15 @@ typedef uint8_t linear_t[16384];
 /* TX queue descriptor. */
 struct txq {
 	struct priv *priv; /* Back pointer to private data. */
+	int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
+	int (*send_pending)();
+#if MLX5_PMD_MAX_INLINE > 0
+	int (*send_pending_inline)();
+#endif
+#if MLX5_PMD_SGE_WR_N > 1
+	int (*send_pending_sg_list)();
+#endif
+	int (*send_flush)(struct ibv_qp *qp);
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
 	struct txq_elt (*elts)[]; /* TX elements. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 214a7c1..3364fca 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -187,6 +187,11 @@ txq_cleanup(struct txq *txq)
 
 	DEBUG("cleaning up %p", (void *)txq);
 	txq_free_elts(txq);
+	txq->poll_cnt = NULL;
+#if MLX5_PMD_MAX_INLINE > 0
+	txq->send_pending_inline = NULL;
+#endif
+	txq->send_flush = NULL;
 	if (txq->if_qp != NULL) {
 		assert(txq->priv != NULL);
 		assert(txq->priv->ctx != NULL);
@@ -414,6 +419,15 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
 	DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
 	txq_cleanup(txq);
 	*txq = tmpl;
+	txq->poll_cnt = txq->if_cq->poll_cnt;
+#if MLX5_PMD_MAX_INLINE > 0
+	txq->send_pending_inline = txq->if_qp->send_pending_inline;
+#endif
+#if MLX5_PMD_SGE_WR_N > 1
+	txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
+#endif
+	txq->send_pending = txq->if_qp->send_pending;
+	txq->send_flush = txq->if_qp->send_flush;
 	DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
 	/* Pre-register known mempools. */
 	rte_mempool_walk(txq_mp2mr_iter, txq);
-- 
2.1.4

* [dpdk-dev] [PATCH 4/6] mlx5: process offload flags only when requested
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
                   ` (2 preceding siblings ...)
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 3/6] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
@ 2016-01-29 10:32 ` Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 5/6] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Improve performance by processing offloads only when requested by the
application.
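
Note the bitwise OR in the test below: csum, csum_l2tun and vlan_strip
are 1-bit fields, so OR-ing them presumably folds the three checks into
a single branch instead of the three that short-circuit evaluation (||)
could generate:

	if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
		/* Offload flags are computed only on this path. */
		seg->packet_type = rxq_cq_to_pkt_type(flags);
		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
	}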

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 63ddc53..c84ec8c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -853,14 +853,16 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NB_SEGS(pkt_buf) = j;
 		PORT(pkt_buf) = rxq->port_id;
 		PKT_LEN(pkt_buf) = pkt_buf_len;
-		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
-		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
+			pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
+			pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-			pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
-			pkt_buf->vlan_tci = vlan_tci;
-		}
+			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
+				pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
+				pkt_buf->vlan_tci = vlan_tci;
+			}
 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		}
 
 		/* Return packet. */
 		*(pkts++) = pkt_buf;
@@ -1006,15 +1008,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NEXT(seg) = NULL;
 		PKT_LEN(seg) = len;
 		DATA_LEN(seg) = len;
-		seg->packet_type = rxq_cq_to_pkt_type(flags);
-		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
+			seg->packet_type = rxq_cq_to_pkt_type(flags);
+			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-			seg->ol_flags |= PKT_RX_VLAN_PKT;
-			seg->vlan_tci = vlan_tci;
-		}
+			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
+				seg->ol_flags |= PKT_RX_VLAN_PKT;
+				seg->vlan_tci = vlan_tci;
+			}
 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-
+		}
 		/* Return packet. */
 		*(pkts++) = seg;
 		++pkts_ret;
-- 
2.1.4

* [dpdk-dev] [PATCH 5/6] mlx5: avoid lkey retrieval for inlined packets
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
                   ` (3 preceding siblings ...)
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 4/6] mlx5: process offload flags only when requested Adrien Mazarguil
@ 2016-01-29 10:32 ` Adrien Mazarguil
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 6/6] mlx5: free buffers immediately after completion Adrien Mazarguil
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  6 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

This improves performance: inlined packet data is copied directly into
the send WQE, so the hardware does not need a memory region key (lkey)
and its retrieval can be skipped.
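
The resulting control flow, sketched with simplified error handling
(the helper names are the driver's own):

	if (length <= txq->max_inline) {
		/* Data is copied into the send WQE; no lkey involved. */
		err = txq->send_pending_inline(txq->qp, (void *)addr,
					       length, send_flags);
	} else {
		/* Only non-inlined sends need the MR key lookup. */
		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
		if (unlikely(lkey == (uint32_t)-1))
			goto stop; /* no MP <-> MR association */
		err = txq->send_pending(txq->qp, addr, length, lkey,
					send_flags);
	}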

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c84ec8c..579efa0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -512,16 +512,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			/* Retrieve buffer information. */
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			length = DATA_LEN(buf);
-			/* Retrieve Memory Region key for this memory pool. */
-			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up TX element. */
-				elt->buf = NULL;
-				goto stop;
-			}
 			/* Update element. */
 			elt->buf = buf;
 			if (txq->priv->vf)
@@ -545,12 +535,24 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					 send_flags);
 			else
 #endif
+			{
+				/* Retrieve Memory Region key for this memory pool. */
+				lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+				if (unlikely(lkey == (uint32_t)-1)) {
+					/* MR does not exist. */
+					DEBUG("%p: unable to get MP <-> MR"
+					      " association", (void *)txq);
+					/* Clean up TX element. */
+					elt->buf = NULL;
+					goto stop;
+				}
 				err = txq->send_pending
 					(txq->qp,
 					 addr,
 					 length,
 					 lkey,
 					 send_flags);
+			}
 			if (unlikely(err))
 				goto stop;
 #ifdef MLX5_PMD_SOFT_COUNTERS
-- 
2.1.4

* [dpdk-dev] [PATCH 6/6] mlx5: free buffers immediately after completion
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
                   ` (4 preceding siblings ...)
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 5/6] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
@ 2016-01-29 10:32 ` Adrien Mazarguil
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  6 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-01-29 10:32 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Freeing mbufs right after their completion has been processed, while
their metadata is likely still in cache, lowers the number of cache
misses.
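
A self-contained sketch of the new completion-time free loop (ring and
element types are simplified, and the prefetch of the next element is
omitted):

#include <rte_mbuf.h>

struct elt_sketch {
	struct rte_mbuf *buf;
};

static void
free_completed_sketch(struct elt_sketch *elts, unsigned int elts_n,
		      unsigned int elts_free, unsigned int elts_tail)
{
	/* Free every completed chain as soon as its completion is
	 * polled, instead of when its ring slot is reused. */
	while (elts_free != elts_tail) {
		struct rte_mbuf *tmp = elts[elts_free].buf;

		/* Faster than rte_pktmbuf_free(). */
		while (tmp != NULL) {
			struct rte_mbuf *next = tmp->next;

			rte_pktmbuf_free_seg(tmp);
			tmp = next;
		}
		elts[elts_free].buf = NULL;
		elts_free = ((elts_free + 1) == elts_n) ?
			    0 : elts_free + 1;
	}
}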

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 579efa0..36abeef 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -84,6 +84,7 @@ txq_complete(struct txq *txq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
+	unsigned int elts_free = txq->elts_tail;
 	const unsigned int elts_n = txq->elts_n;
 	int wcs_n;
 
@@ -110,6 +111,25 @@ txq_complete(struct txq *txq)
 	elts_tail += wcs_n * txq->elts_comp_cd_init;
 	if (elts_tail >= elts_n)
 		elts_tail -= elts_n;
+
+	while (elts_free != elts_tail) {
+		struct txq_elt *elt = &(*txq->elts)[elts_free];
+		unsigned int elts_free_next =
+			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
+		struct rte_mbuf *tmp = elt->buf;
+		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+		/* Faster than rte_pktmbuf_free(). */
+		do {
+			struct rte_mbuf *next = NEXT(tmp);
+
+			rte_pktmbuf_free_seg(tmp);
+			tmp = next;
+		} while (tmp != NULL);
+		elts_free = elts_free_next;
+	}
+
 	txq->elts_tail = elts_tail;
 	txq->elts_comp = elts_comp;
 	return 0;
@@ -464,7 +484,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		struct rte_mbuf *buf_next = pkts[i + 1];
 		unsigned int elts_head_next =
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
 		unsigned int segs = NB_SEGS(buf);
 #ifdef MLX5_PMD_SOFT_COUNTERS
@@ -472,18 +491,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #endif
 		uint32_t send_flags = 0;
 
-		/* Clean up old buffer. */
-		if (likely(elt->buf != NULL)) {
-			struct rte_mbuf *tmp = elt->buf;
-
-			/* Faster than rte_pktmbuf_free(). */
-			do {
-				struct rte_mbuf *next = NEXT(tmp);
-
-				rte_pktmbuf_free_seg(tmp);
-				tmp = next;
-			} while (tmp != NULL);
-		}
 		if (i + 1 < max)
 			rte_prefetch0(buf_next);
 		/* Request TX completion. */
@@ -517,7 +524,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (txq->priv->vf)
 				rte_prefetch0((volatile void *)
 					      (uintptr_t)addr);
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Prefetch next buffer data. */
 			if (i + 1 < max) {
 				buf_next_addr =
@@ -567,7 +573,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					  &sges);
 			if (ret.length == (unsigned int)-1)
 				goto stop;
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Put SG list into send queue. */
 			err = txq->send_pending_sg_list
 				(txq->qp,
-- 
2.1.4

* [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4
  2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
                   ` (5 preceding siblings ...)
  2016-01-29 10:32 ` [dpdk-dev] [PATCH 6/6] mlx5: free buffers immediately after completion Adrien Mazarguil
@ 2016-02-22 18:17 ` Adrien Mazarguil
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
                     ` (7 more replies)
  6 siblings, 8 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:17 UTC (permalink / raw)
  To: dev

This patchset improves mlx5 PMD performance through better prefetching,
reordered internal structure fields and the removal of a few unnecessary
operations.

Note: should be applied after "Add flow director and RX VLAN stripping
support" to avoid conflicts.

Changes in v2:
- Rebased patchset on top of dpdk-next-net/rel_16_04.
- Fixed missing update for receive function in rxq_rehash().
- Added a commit to register memory on page boundaries instead of mempool
  object boundaries for better performance (mlx4 and mlx5).

Adrien Mazarguil (1):
  mlx: use aligned memory to register regions

Nelio Laranjeiro (6):
  mlx5: prefetch next TX mbuf header and data
  mlx5: reorder TX/RX queue structure
  mlx5: remove one indirection level from RX/TX functions
  mlx5: process offload flags only when requested
  mlx5: avoid lkey retrieval for inlined packets
  mlx5: free buffers immediately after completion

 drivers/net/mlx4/mlx4.c      |  58 ++++++++++---
 drivers/net/mlx5/Makefile    |   1 +
 drivers/net/mlx5/mlx5_rxq.c  |  22 +++--
 drivers/net/mlx5/mlx5_rxtx.c | 189 +++++++++++++++++++++++++++----------------
 drivers/net/mlx5/mlx5_rxtx.h |  55 ++++++++-----
 drivers/net/mlx5/mlx5_txq.c  |  14 ++++
 6 files changed, 236 insertions(+), 103 deletions(-)

-- 
2.1.4

* [dpdk-dev] [PATCH v2 1/7] mlx5: prefetch next TX mbuf header and data
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
@ 2016-02-22 18:17   ` Adrien Mazarguil
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 2/7] mlx5: reorder TX/RX queue structure Adrien Mazarguil
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:17 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Prefetching the next mbuf header and packet data while the current one
is being processed hides memory latency and improves performance
noticeably.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7585570..bee5ce2 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -443,8 +443,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i;
 	unsigned int max;
 	int err;
+	struct rte_mbuf *buf = pkts[0];
 
 	assert(elts_comp_cd != 0);
+	/* Prefetch first packet cacheline. */
+	rte_prefetch0(buf);
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
@@ -458,7 +461,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
+		struct rte_mbuf *buf_next = pkts[i + 1];
 		unsigned int elts_head_next =
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
@@ -481,6 +484,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				tmp = next;
 			} while (tmp != NULL);
 		}
+		if (i + 1 < max)
+			rte_prefetch0(buf_next);
 		/* Request TX completion. */
 		if (unlikely(--elts_comp_cd == 0)) {
 			elts_comp_cd = txq->elts_comp_cd_init;
@@ -502,6 +507,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			uintptr_t addr;
 			uint32_t length;
 			uint32_t lkey;
+			uintptr_t buf_next_addr;
 
 			/* Retrieve buffer information. */
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
@@ -522,6 +528,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				rte_prefetch0((volatile void *)
 					      (uintptr_t)addr);
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+			/* Prefetch next buffer data. */
+			if (i + 1 < max) {
+				buf_next_addr =
+					rte_pktmbuf_mtod(buf_next, uintptr_t);
+				rte_prefetch0((volatile void *)
+					      (uintptr_t)buf_next_addr);
+			}
 			/* Put packet into send queue. */
 #if MLX5_PMD_MAX_INLINE > 0
 			if (length <= txq->max_inline)
@@ -571,6 +584,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #endif /* MLX5_PMD_SGE_WR_N > 1 */
 		}
 		elts_head = elts_head_next;
+		buf = buf_next;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += sent_size;
-- 
2.1.4

* [dpdk-dev] [PATCH v2 2/7] mlx5: reorder TX/RX queue structure
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
@ 2016-02-22 18:17   ` Adrien Mazarguil
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 3/7] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:17 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Remove padding and move fields used by the data path to the beginning
of the TX/RX queue structures for better cache locality; fields needed
only during setup and cleanup are moved to the end.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.h | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index fde0ca2..4a857d8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -105,7 +105,6 @@ struct priv;
 struct rxq {
 	struct priv *priv; /* Back pointer to private data. */
 	struct rte_mempool *mp; /* Memory Pool for allocations. */
-	struct ibv_mr *mr; /* Memory Region (for mp). */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
 	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
@@ -117,19 +116,20 @@ struct rxq {
 	unsigned int port_id; /* Port ID for incoming packets. */
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
-	union {
-		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
-		struct rxq_elt (*no_sp)[]; /* RX elements. */
-	} elts;
 	unsigned int sp:1; /* Use scattered RX elements. */
 	unsigned int csum:1; /* Enable checksum offloading. */
 	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
+	union {
+		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
+		struct rxq_elt (*no_sp)[]; /* RX elements. */
+	} elts;
 	uint32_t mb_len; /* Length of a mp-issued mbuf. */
-	struct mlx5_rxq_stats stats; /* RX queue counters. */
 	unsigned int socket; /* CPU socket ID for allocations. */
+	struct mlx5_rxq_stats stats; /* RX queue counters. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
+	struct ibv_mr *mr; /* Memory Region (for mp). */
 };
 
 /* Hash RX queue types. */
@@ -248,30 +248,31 @@ typedef uint8_t linear_t[16384];
 /* TX queue descriptor. */
 struct txq {
 	struct priv *priv; /* Back pointer to private data. */
-	struct {
-		const struct rte_mempool *mp; /* Cached Memory Pool. */
-		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* mr->lkey */
-	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
-	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+	struct txq_elt (*elts)[]; /* TX elements. */
 #if MLX5_PMD_MAX_INLINE > 0
 	uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
 #endif
 	unsigned int elts_n; /* (*elts)[] length. */
-	struct txq_elt (*elts)[]; /* TX elements. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
 	unsigned int elts_tail; /* First element awaiting completion. */
 	unsigned int elts_comp; /* Number of completion requests. */
 	unsigned int elts_comp_cd; /* Countdown for next completion request. */
 	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+	struct {
+		const struct rte_mempool *mp; /* Cached Memory Pool. */
+		struct ibv_mr *mr; /* Memory Region (for mp). */
+		uint32_t lkey; /* mr->lkey */
+	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+	/* Elements used only for init part are here. */
 	linear_t (*elts_linear)[]; /* Linearized buffers. */
 	struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
-	unsigned int socket; /* CPU socket ID for allocations. */
+	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
+	unsigned int socket; /* CPU socket ID for allocations. */
 };
 
 /* mlx5_rxq.c */
-- 
2.1.4

* [dpdk-dev] [PATCH v2 3/7] mlx5: remove one indirection level from RX/TX functions
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 2/7] mlx5: reorder TX/RX queue structure Adrien Mazarguil
@ 2016-02-22 18:17   ` Adrien Mazarguil
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 4/7] mlx5: process offload flags only when requested Adrien Mazarguil
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:17 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Avoid dereferencing pointers twice to get to fast Verbs functions by
storing them directly in RX/TX queue structures.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Yaacov Hazan <yaacovh@mellanox.com>
---
 drivers/net/mlx5/Makefile    |  1 +
 drivers/net/mlx5/mlx5_rxq.c  | 16 ++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.c | 34 +++++++++-------------------------
 drivers/net/mlx5/mlx5_rxtx.h | 23 +++++++++++++++++------
 drivers/net/mlx5/mlx5_txq.c  | 14 ++++++++++++++
 5 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 46a17e0..39cdf2c 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -67,6 +67,7 @@ CFLAGS += -g
 CFLAGS += -I.
 CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -Wno-strict-prototypes
 LDLIBS += -libverbs
 
 # A few warnings cannot be avoided in external headers.
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 573ad8f..55d002e 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -901,6 +901,8 @@ rxq_cleanup(struct rxq *rxq)
 		rxq_free_elts_sp(rxq);
 	else
 		rxq_free_elts(rxq);
+	rxq->poll = NULL;
+	rxq->recv = NULL;
 	if (rxq->if_wq != NULL) {
 		assert(rxq->priv != NULL);
 		assert(rxq->priv->ctx != NULL);
@@ -1103,6 +1105,10 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
 		err = EIO;
 		goto error;
 	}
+	if (tmpl.sp)
+		tmpl.recv = tmpl.if_wq->recv_sg_list;
+	else
+		tmpl.recv = tmpl.if_wq->recv_burst;
 error:
 	*rxq = tmpl;
 	assert(err >= 0);
@@ -1345,6 +1351,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	*rxq = tmpl;
 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
 	assert(ret == 0);
+	/* Assign function in queue. */
+#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
+	rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
+#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	rxq->poll = rxq->if_cq->poll_length_flags;
+#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	if (rxq->sp)
+		rxq->recv = rxq->if_wq->recv_sg_list;
+	else
+		rxq->recv = rxq->if_wq->recv_burst;
 	return 0;
 error:
 	rxq_cleanup(&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index bee5ce2..63ddc53 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -93,7 +93,7 @@ txq_complete(struct txq *txq)
 	DEBUG("%p: processing %u work requests completions",
 	      (void *)txq, elts_comp);
 #endif
-	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
+	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
 	if (unlikely(wcs_n == 0))
 		return 0;
 	if (unlikely(wcs_n < 0)) {
@@ -538,14 +538,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			/* Put packet into send queue. */
 #if MLX5_PMD_MAX_INLINE > 0
 			if (length <= txq->max_inline)
-				err = txq->if_qp->send_pending_inline
+				err = txq->send_pending_inline
 					(txq->qp,
 					 (void *)addr,
 					 length,
 					 send_flags);
 			else
 #endif
-				err = txq->if_qp->send_pending
+				err = txq->send_pending
 					(txq->qp,
 					 addr,
 					 length,
@@ -567,7 +567,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				goto stop;
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Put SG list into send queue. */
-			err = txq->if_qp->send_pending_sg_list
+			err = txq->send_pending_sg_list
 				(txq->qp,
 				 sges,
 				 ret.num,
@@ -599,7 +599,7 @@ stop:
 	txq->stats.opackets += i;
 #endif
 	/* Ring QP doorbell. */
-	err = txq->if_qp->send_flush(txq->qp);
+	err = txq->send_flush(txq->qp);
 	if (unlikely(err)) {
 		/* A nonzero value is not supposed to be returned.
 		 * Nothing can be done about it. */
@@ -733,14 +733,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Sanity checks. */
 		assert(elts_head < rxq->elts_n);
 		assert(rxq->elts_head < rxq->elts_n);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL,
-							  &flags, &vlan_tci);
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		(void)vlan_tci;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
 			int wcs_n;
@@ -877,9 +870,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		rxq->stats.ibytes += pkt_buf_len;
 #endif
 repost:
-		ret = rxq->if_wq->recv_sg_list(rxq->wq,
-					       elt->sges,
-					       RTE_DIM(elt->sges));
+		ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
 		if (unlikely(ret)) {
 			/* Inability to repost WRs is fatal. */
 			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
@@ -950,14 +941,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 */
 		rte_prefetch0(seg);
 		rte_prefetch0(&seg->cacheline1);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL,
-							  &flags, &vlan_tci);
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		(void)vlan_tci;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
 			int wcs_n;
@@ -1049,7 +1033,7 @@ repost:
 #ifdef DEBUG_RECV
 	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
 #endif
-	ret = rxq->if_wq->recv_burst(rxq->wq, sges, i);
+	ret = rxq->recv(rxq->wq, sges, i);
 	if (unlikely(ret)) {
 		/* Inability to repost WRs is fatal. */
 		DEBUG("%p: recv_burst(): failed (ret=%d)",
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 4a857d8..b239ebf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -107,12 +107,8 @@ struct rxq {
 	struct rte_mempool *mp; /* Memory Pool for allocations. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
-	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-	struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	int32_t (*poll)(); /* Verbs poll function. */
+	int32_t (*recv)(); /* Verbs receive function. */
 	unsigned int port_id; /* Port ID for incoming packets. */
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
@@ -130,6 +126,12 @@ struct rxq {
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
 	struct ibv_mr *mr; /* Memory Region (for mp). */
+	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
+#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
+	struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
+#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
 };
 
 /* Hash RX queue types. */
@@ -248,6 +250,15 @@ typedef uint8_t linear_t[16384];
 /* TX queue descriptor. */
 struct txq {
 	struct priv *priv; /* Back pointer to private data. */
+	int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
+	int (*send_pending)();
+#if MLX5_PMD_MAX_INLINE > 0
+	int (*send_pending_inline)();
+#endif
+#if MLX5_PMD_SGE_WR_N > 1
+	int (*send_pending_sg_list)();
+#endif
+	int (*send_flush)(struct ibv_qp *qp);
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
 	struct txq_elt (*elts)[]; /* TX elements. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 214a7c1..3364fca 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -187,6 +187,11 @@ txq_cleanup(struct txq *txq)
 
 	DEBUG("cleaning up %p", (void *)txq);
 	txq_free_elts(txq);
+	txq->poll_cnt = NULL;
+#if MLX5_PMD_MAX_INLINE > 0
+	txq->send_pending_inline = NULL;
+#endif
+	txq->send_flush = NULL;
 	if (txq->if_qp != NULL) {
 		assert(txq->priv != NULL);
 		assert(txq->priv->ctx != NULL);
@@ -414,6 +419,15 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
 	DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
 	txq_cleanup(txq);
 	*txq = tmpl;
+	txq->poll_cnt = txq->if_cq->poll_cnt;
+#if MLX5_PMD_MAX_INLINE > 0
+	txq->send_pending_inline = txq->if_qp->send_pending_inline;
+#endif
+#if MLX5_PMD_SGE_WR_N > 1
+	txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
+#endif
+	txq->send_pending = txq->if_qp->send_pending;
+	txq->send_flush = txq->if_qp->send_flush;
 	DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
 	/* Pre-register known mempools. */
 	rte_mempool_walk(txq_mp2mr_iter, txq);
-- 
2.1.4

* [dpdk-dev] [PATCH v2 4/7] mlx5: process offload flags only when requested
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                     ` (2 preceding siblings ...)
  2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 3/7] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
@ 2016-02-22 18:18   ` Adrien Mazarguil
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 5/7] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:18 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Improve performance by processing offloads only when requested by the
application.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 63ddc53..c84ec8c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -853,14 +853,16 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NB_SEGS(pkt_buf) = j;
 		PORT(pkt_buf) = rxq->port_id;
 		PKT_LEN(pkt_buf) = pkt_buf_len;
-		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
-		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
+			pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
+			pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-			pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
-			pkt_buf->vlan_tci = vlan_tci;
-		}
+			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
+				pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
+				pkt_buf->vlan_tci = vlan_tci;
+			}
 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		}
 
 		/* Return packet. */
 		*(pkts++) = pkt_buf;
@@ -1006,15 +1008,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NEXT(seg) = NULL;
 		PKT_LEN(seg) = len;
 		DATA_LEN(seg) = len;
-		seg->packet_type = rxq_cq_to_pkt_type(flags);
-		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
+			seg->packet_type = rxq_cq_to_pkt_type(flags);
+			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-			seg->ol_flags |= PKT_RX_VLAN_PKT;
-			seg->vlan_tci = vlan_tci;
-		}
+			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
+				seg->ol_flags |= PKT_RX_VLAN_PKT;
+				seg->vlan_tci = vlan_tci;
+			}
 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-
+		}
 		/* Return packet. */
 		*(pkts++) = seg;
 		++pkts_ret;
-- 
2.1.4

* [dpdk-dev] [PATCH v2 5/7] mlx5: avoid lkey retrieval for inlined packets
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                     ` (3 preceding siblings ...)
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 4/7] mlx5: process offload flags only when requested Adrien Mazarguil
@ 2016-02-22 18:18   ` Adrien Mazarguil
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 6/7] mlx5: free buffers immediately after completion Adrien Mazarguil
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:18 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

This improves performance: inlined packet data is copied directly into
the send WQE, so the hardware does not need a memory region key (lkey)
and its retrieval can be skipped.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c84ec8c..b82017e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -512,16 +512,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			/* Retrieve buffer information. */
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			length = DATA_LEN(buf);
-			/* Retrieve Memory Region key for this memory pool. */
-			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up TX element. */
-				elt->buf = NULL;
-				goto stop;
-			}
 			/* Update element. */
 			elt->buf = buf;
 			if (txq->priv->vf)
@@ -545,12 +535,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					 send_flags);
 			else
 #endif
+			{
+				/* Retrieve Memory Region key for this
+				 * memory pool. */
+				lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+				if (unlikely(lkey == (uint32_t)-1)) {
+					/* MR does not exist. */
+					DEBUG("%p: unable to get MP <-> MR"
+					      " association", (void *)txq);
+					/* Clean up TX element. */
+					elt->buf = NULL;
+					goto stop;
+				}
 				err = txq->send_pending
 					(txq->qp,
 					 addr,
 					 length,
 					 lkey,
 					 send_flags);
+			}
 			if (unlikely(err))
 				goto stop;
 #ifdef MLX5_PMD_SOFT_COUNTERS
-- 
2.1.4

* [dpdk-dev] [PATCH v2 6/7] mlx5: free buffers immediately after completion
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                     ` (4 preceding siblings ...)
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 5/7] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
@ 2016-02-22 18:18   ` Adrien Mazarguil
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 7/7] mlx: use aligned memory to register regions Adrien Mazarguil
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:18 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Freeing mbufs right after their completion has been processed, while
their metadata is likely still in cache, lowers the number of cache
misses.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b82017e..622ac17 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -84,6 +84,7 @@ txq_complete(struct txq *txq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
+	unsigned int elts_free = txq->elts_tail;
 	const unsigned int elts_n = txq->elts_n;
 	int wcs_n;
 
@@ -110,6 +111,25 @@ txq_complete(struct txq *txq)
 	elts_tail += wcs_n * txq->elts_comp_cd_init;
 	if (elts_tail >= elts_n)
 		elts_tail -= elts_n;
+
+	while (elts_free != elts_tail) {
+		struct txq_elt *elt = &(*txq->elts)[elts_free];
+		unsigned int elts_free_next =
+			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
+		struct rte_mbuf *tmp = elt->buf;
+		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+		/* Faster than rte_pktmbuf_free(). */
+		do {
+			struct rte_mbuf *next = NEXT(tmp);
+
+			rte_pktmbuf_free_seg(tmp);
+			tmp = next;
+		} while (tmp != NULL);
+		elts_free = elts_free_next;
+	}
+
 	txq->elts_tail = elts_tail;
 	txq->elts_comp = elts_comp;
 	return 0;
@@ -464,7 +484,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		struct rte_mbuf *buf_next = pkts[i + 1];
 		unsigned int elts_head_next =
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
 		unsigned int segs = NB_SEGS(buf);
 #ifdef MLX5_PMD_SOFT_COUNTERS
@@ -472,18 +491,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #endif
 		uint32_t send_flags = 0;
 
-		/* Clean up old buffer. */
-		if (likely(elt->buf != NULL)) {
-			struct rte_mbuf *tmp = elt->buf;
-
-			/* Faster than rte_pktmbuf_free(). */
-			do {
-				struct rte_mbuf *next = NEXT(tmp);
-
-				rte_pktmbuf_free_seg(tmp);
-				tmp = next;
-			} while (tmp != NULL);
-		}
 		if (i + 1 < max)
 			rte_prefetch0(buf_next);
 		/* Request TX completion. */
@@ -517,7 +524,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (txq->priv->vf)
 				rte_prefetch0((volatile void *)
 					      (uintptr_t)addr);
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Prefetch next buffer data. */
 			if (i + 1 < max) {
 				buf_next_addr =
@@ -568,7 +574,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					  &sges);
 			if (ret.length == (unsigned int)-1)
 				goto stop;
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Put SG list into send queue. */
 			err = txq->send_pending_sg_list
 				(txq->qp,
-- 
2.1.4

* [dpdk-dev] [PATCH v2 7/7] mlx: use aligned memory to register regions
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                     ` (5 preceding siblings ...)
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 6/7] mlx5: free buffers immediately after completion Adrien Mazarguil
@ 2016-02-22 18:18   ` Adrien Mazarguil
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-02-22 18:18 UTC (permalink / raw)
  To: dev

The first and last memory pool elements are usually cache-aligned but not
page-aligned, particularly when using huge pages.

Hardware performance can be improved significantly by registering memory
regions starting and ending on page boundaries.
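
For illustration, the rounding amounts to pushing each boundary outward
to the enclosing segment's hugepage alignment. A minimal standalone
sketch with made-up addresses and an assumed 2 MB hugepage size; the
helpers below are equivalent in effect to RTE_ALIGN_FLOOR/RTE_ALIGN_CEIL
from rte_common.h for power-of-two alignments:

	#include <stdint.h>
	#include <stdio.h>

	/* Power-of-two alignment helpers (same semantics as the RTE macros). */
	#define ALIGN_FLOOR(v, a) ((v) & ~((uintptr_t)(a) - 1))
	#define ALIGN_CEIL(v, a)  ALIGN_FLOOR((v) + (a) - 1, a)

	int main(void)
	{
		uintptr_t start = 0x7f5600201240; /* hypothetical elt_va_start */
		uintptr_t end   = 0x7f56005ffd80; /* hypothetical elt_va_end */
		uintptr_t huge  = 2UL << 20;      /* assumed 2 MB hugepages */

		/* Round the MR outward so it starts and ends on page
		 * boundaries: 0x7f5600200000-0x7f5600600000 here. */
		printf("MR spans %#lx-%#lx\n",
		       (unsigned long)ALIGN_FLOOR(start, huge),
		       (unsigned long)ALIGN_CEIL(end, huge));
		return 0;
	}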

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4.c      | 58 +++++++++++++++++++++++++++++++++++++-------
 drivers/net/mlx5/mlx5_rxq.c  |  6 +----
 drivers/net/mlx5/mlx5_rxtx.c | 52 ++++++++++++++++++++++++++++++++++++---
 drivers/net/mlx5/mlx5_rxtx.h |  1 +
 4 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 6688f66..3c1f4c2 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -86,6 +86,7 @@
 #include <rte_version.h>
 #include <rte_log.h>
 #include <rte_alarm.h>
+#include <rte_memory.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -1177,6 +1178,52 @@ txq_complete(struct txq *txq)
 	return 0;
 }
 
+/* For best performance, this function should not be inlined. */
+static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, const struct rte_mempool *)
+	__attribute__((noinline));
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ *   Pointer to protection domain.
+ * @param mp
+ *   Pointer to memory pool.
+ *
+ * @return
+ *   Memory region pointer, NULL in case of error.
+ */
+static struct ibv_mr *
+mlx4_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start = mp->elt_va_start;
+	uintptr_t end = mp->elt_va_end;
+	unsigned int i;
+
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		unsigned int align = ms[i].hugepage_sz;
+
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	return ibv_reg_mr(pd,
+			  (void *)start,
+			  end - start,
+			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+}
+
 /**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
  * the cloned mbuf is allocated is returned instead.
@@ -1228,10 +1275,7 @@ txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (const void *)mp);
-	mr = ibv_reg_mr(txq->priv->pd,
-			(void *)mp->elt_va_start,
-			(mp->elt_va_end - mp->elt_va_start),
-			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq);
@@ -3713,11 +3757,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	DEBUG("%p: %s scattered packets support (%u WRs)",
 	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
 	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = ibv_reg_mr(priv->pd,
-			     (void *)mp->elt_va_start,
-			     (mp->elt_va_end - mp->elt_va_start),
-			     (IBV_ACCESS_LOCAL_WRITE |
-			      IBV_ACCESS_REMOTE_WRITE));
+	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
 		ret = EINVAL;
 		ERROR("%p: MR creation failure: %s",
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 55d002e..0f5ac65 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1190,11 +1190,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	DEBUG("%p: %s scattered packets support (%u WRs)",
 	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
 	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = ibv_reg_mr(priv->pd,
-			     (void *)mp->elt_va_start,
-			     (mp->elt_va_end - mp->elt_va_start),
-			     (IBV_ACCESS_LOCAL_WRITE |
-			      IBV_ACCESS_REMOTE_WRITE));
+	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
 		ret = EINVAL;
 		ERROR("%p: MR creation failure: %s",
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 622ac17..4c53c7a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -55,6 +55,7 @@
 #include <rte_prefetch.h>
 #include <rte_common.h>
 #include <rte_branch_prediction.h>
+#include <rte_memory.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -135,6 +136,52 @@ txq_complete(struct txq *txq)
 	return 0;
 }
 
+/* For best performance, this function should not be inlined. */
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *)
+	__attribute__((noinline));
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ *   Pointer to protection domain.
+ * @param mp
+ *   Pointer to memory pool.
+ *
+ * @return
+ *   Memory region pointer, NULL in case of error.
+ */
+struct ibv_mr *
+mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start = mp->elt_va_start;
+	uintptr_t end = mp->elt_va_end;
+	unsigned int i;
+
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		unsigned int align = ms[i].hugepage_sz;
+
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	return ibv_reg_mr(pd,
+			  (void *)start,
+			  end - start,
+			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+}
+
 /**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
  * the cloned mbuf is allocated is returned instead.
@@ -186,10 +233,7 @@ txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (const void *)mp);
-	mr = ibv_reg_mr(txq->priv->pd,
-			(void *)mp->elt_va_start,
-			(mp->elt_va_end - mp->elt_va_start),
-			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+	mr = mlx5_mp2mr(txq->priv->pd, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq);
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index b239ebf..e85cf93 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -317,6 +317,7 @@ void mlx5_tx_queue_release(void *);
 
 /* mlx5_rxtx.c */
 
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *);
 void txq_mp2mr_iter(const struct rte_mempool *, void *);
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4
  2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                     ` (6 preceding siblings ...)
  2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 7/7] mlx: use aligned memory to register regions Adrien Mazarguil
@ 2016-03-03 14:27   ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
                       ` (7 more replies)
  7 siblings, 8 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

This patchset improves the mlx5 PMD performance by doing better prefetching,
by reordering internal structure fields and by removing a few unnecessary
operations.

Note: should be applied after "Add flow director and RX VLAN stripping
support" to avoid conflicts.

Changes in v3:
- None, submitted again due to a dependency on the previous patchset.

Changes in v2:
- Rebased patchset on top of dpdk-next-net/rel_16_04.
- Fixed missing update for receive function in rxq_rehash().
- Added a commit to register memory on page boundaries instead of mempool
  object boundaries for better performance (mlx4 and mlx5).

Adrien Mazarguil (1):
  mlx: use aligned memory to register regions

Nelio Laranjeiro (6):
  mlx5: prefetch next TX mbuf header and data
  mlx5: reorder TX/RX queue structure
  mlx5: remove one indirection level from RX/TX functions
  mlx5: process offload flags only when requested
  mlx5: avoid lkey retrieval for inlined packets
  mlx5: free buffers immediately after completion

 drivers/net/mlx4/mlx4.c      |  58 ++++++++++---
 drivers/net/mlx5/Makefile    |   1 +
 drivers/net/mlx5/mlx5_rxq.c  |  22 +++--
 drivers/net/mlx5/mlx5_rxtx.c | 189 +++++++++++++++++++++++++++----------------
 drivers/net/mlx5/mlx5_rxtx.h |  55 ++++++++-----
 drivers/net/mlx5/mlx5_txq.c  |  14 ++++
 6 files changed, 236 insertions(+), 103 deletions(-)

-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 1/7] mlx5: prefetch next TX mbuf header and data
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 2/7] mlx5: reorder TX/RX queue structure Adrien Mazarguil
                       ` (6 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

This change improves performance noticeably.
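
The pattern is plain software pipelining: while packet i is being
turned into a send descriptor, the header and data of packet i+1 are
prefetched so they are already in cache when the next iteration needs
them. A minimal sketch of the idea (simplified, not the driver loop
itself):

	#include <rte_mbuf.h>
	#include <rte_prefetch.h>

	static void
	tx_burst_sketch(struct rte_mbuf **pkts, uint16_t n)
	{
		uint16_t i;

		if (n == 0)
			return;
		/* Warm up the first mbuf header before the loop. */
		rte_prefetch0(pkts[0]);
		for (i = 0; i != n; ++i) {
			if (i + 1 < n) {
				/* Prefetch next header and packet data. */
				rte_prefetch0(pkts[i + 1]);
				rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
							       void *));
			}
			/* ... build and post descriptor for pkts[i] ... */
		}
	}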

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7585570..bee5ce2 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -443,8 +443,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i;
 	unsigned int max;
 	int err;
+	struct rte_mbuf *buf = pkts[0];
 
 	assert(elts_comp_cd != 0);
+	/* Prefetch first packet cacheline. */
+	rte_prefetch0(buf);
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
@@ -458,7 +461,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
+		struct rte_mbuf *buf_next = pkts[i + 1];
 		unsigned int elts_head_next =
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
@@ -481,6 +484,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				tmp = next;
 			} while (tmp != NULL);
 		}
+		if (i + 1 < max)
+			rte_prefetch0(buf_next);
 		/* Request TX completion. */
 		if (unlikely(--elts_comp_cd == 0)) {
 			elts_comp_cd = txq->elts_comp_cd_init;
@@ -502,6 +507,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			uintptr_t addr;
 			uint32_t length;
 			uint32_t lkey;
+			uintptr_t buf_next_addr;
 
 			/* Retrieve buffer information. */
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
@@ -522,6 +528,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				rte_prefetch0((volatile void *)
 					      (uintptr_t)addr);
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+			/* Prefetch next buffer data. */
+			if (i + 1 < max) {
+				buf_next_addr =
+					rte_pktmbuf_mtod(buf_next, uintptr_t);
+				rte_prefetch0((volatile void *)
+					      (uintptr_t)buf_next_addr);
+			}
 			/* Put packet into send queue. */
 #if MLX5_PMD_MAX_INLINE > 0
 			if (length <= txq->max_inline)
@@ -571,6 +584,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #endif /* MLX5_PMD_SGE_WR_N > 1 */
 		}
 		elts_head = elts_head_next;
+		buf = buf_next;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += sent_size;
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 2/7] mlx5: reorder TX/RX queue structure
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 3/7] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
                       ` (5 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Remove padding and move important fields to the beginning for better
performance.
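
The principle is to keep fields the data path touches on every burst
within the first cache line(s) and push setup-only members to the
tail. A toy example of the layout (hypothetical structure, not the
driver's):

	/* Hot members first: read or written per burst call, so they
	 * share the leading cache line. Cold members last: only used
	 * during queue setup, teardown or stats retrieval. */
	struct queue_sketch {
		void *ring;          /* hot: descriptor ring */
		unsigned int head;   /* hot: producer index */
		unsigned int tail;   /* hot: consumer index */
		unsigned int n;      /* hot: ring size */
		/* --- rarely accessed below this point --- */
		unsigned int socket; /* cold: NUMA socket ID */
		void *setup_ctx;     /* cold: control-path handle */
	};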

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.h | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index fde0ca2..4a857d8 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -105,7 +105,6 @@ struct priv;
 struct rxq {
 	struct priv *priv; /* Back pointer to private data. */
 	struct rte_mempool *mp; /* Memory Pool for allocations. */
-	struct ibv_mr *mr; /* Memory Region (for mp). */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
 	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
@@ -117,19 +116,20 @@ struct rxq {
 	unsigned int port_id; /* Port ID for incoming packets. */
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
-	union {
-		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
-		struct rxq_elt (*no_sp)[]; /* RX elements. */
-	} elts;
 	unsigned int sp:1; /* Use scattered RX elements. */
 	unsigned int csum:1; /* Enable checksum offloading. */
 	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
+	union {
+		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
+		struct rxq_elt (*no_sp)[]; /* RX elements. */
+	} elts;
 	uint32_t mb_len; /* Length of a mp-issued mbuf. */
-	struct mlx5_rxq_stats stats; /* RX queue counters. */
 	unsigned int socket; /* CPU socket ID for allocations. */
+	struct mlx5_rxq_stats stats; /* RX queue counters. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
+	struct ibv_mr *mr; /* Memory Region (for mp). */
 };
 
 /* Hash RX queue types. */
@@ -248,30 +248,31 @@ typedef uint8_t linear_t[16384];
 /* TX queue descriptor. */
 struct txq {
 	struct priv *priv; /* Back pointer to private data. */
-	struct {
-		const struct rte_mempool *mp; /* Cached Memory Pool. */
-		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* mr->lkey */
-	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
-	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+	struct txq_elt (*elts)[]; /* TX elements. */
 #if MLX5_PMD_MAX_INLINE > 0
 	uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
 #endif
 	unsigned int elts_n; /* (*elts)[] length. */
-	struct txq_elt (*elts)[]; /* TX elements. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
 	unsigned int elts_tail; /* First element awaiting completion. */
 	unsigned int elts_comp; /* Number of completion requests. */
 	unsigned int elts_comp_cd; /* Countdown for next completion request. */
 	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+	struct {
+		const struct rte_mempool *mp; /* Cached Memory Pool. */
+		struct ibv_mr *mr; /* Memory Region (for mp). */
+		uint32_t lkey; /* mr->lkey */
+	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+	/* Elements used only for init part are here. */
 	linear_t (*elts_linear)[]; /* Linearized buffers. */
 	struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
-	unsigned int socket; /* CPU socket ID for allocations. */
+	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
+	unsigned int socket; /* CPU socket ID for allocations. */
 };
 
 /* mlx5_rxq.c */
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 3/7] mlx5: remove one indirection level from RX/TX functions
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 2/7] mlx5: reorder TX/RX queue structure Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 4/7] mlx5: process offload flags only when requested Adrien Mazarguil
                       ` (4 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Avoid dereferencing pointers twice to get to fast Verbs functions by
storing them directly in RX/TX queue structures.
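
Concretely, instead of loading txq->if_qp and then if_qp->send_pending
on every call, the pointer is copied into struct txq once at setup
time, leaving a single load in the hot path. A self-contained toy of
the technique (hypothetical names):

	#include <stdio.h>

	struct ops { int (*send)(int v); };

	struct queue {
		int (*send)(int v); /* cached copy used by the burst path */
		struct ops *if_qp;  /* burst interface found at setup */
	};

	static int do_send(int v) { return v + 1; }

	int main(void)
	{
		struct ops ops = { .send = do_send };
		struct queue q = { .if_qp = &ops };

		q.send = q.if_qp->send;     /* done once at queue setup */
		printf("%d\n", q.send(41)); /* one load instead of two */
		return 0;
	}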

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Yaacov Hazan <yaacovh@mellanox.com>
---
 drivers/net/mlx5/Makefile    |  1 +
 drivers/net/mlx5/mlx5_rxq.c  | 16 ++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.c | 34 +++++++++-------------------------
 drivers/net/mlx5/mlx5_rxtx.h | 23 +++++++++++++++++------
 drivers/net/mlx5/mlx5_txq.c  | 14 ++++++++++++++
 5 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 46a17e0..39cdf2c 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -67,6 +67,7 @@ CFLAGS += -g
 CFLAGS += -I.
 CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -Wno-strict-prototypes
 LDLIBS += -libverbs
 
 # A few warnings cannot be avoided in external headers.
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 573ad8f..55d002e 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -901,6 +901,8 @@ rxq_cleanup(struct rxq *rxq)
 		rxq_free_elts_sp(rxq);
 	else
 		rxq_free_elts(rxq);
+	rxq->poll = NULL;
+	rxq->recv = NULL;
 	if (rxq->if_wq != NULL) {
 		assert(rxq->priv != NULL);
 		assert(rxq->priv->ctx != NULL);
@@ -1103,6 +1105,10 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
 		err = EIO;
 		goto error;
 	}
+	if (tmpl.sp)
+		tmpl.recv = tmpl.if_wq->recv_sg_list;
+	else
+		tmpl.recv = tmpl.if_wq->recv_burst;
 error:
 	*rxq = tmpl;
 	assert(err >= 0);
@@ -1345,6 +1351,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	*rxq = tmpl;
 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
 	assert(ret == 0);
+	/* Assign function in queue. */
+#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
+	rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
+#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	rxq->poll = rxq->if_cq->poll_length_flags;
+#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	if (rxq->sp)
+		rxq->recv = rxq->if_wq->recv_sg_list;
+	else
+		rxq->recv = rxq->if_wq->recv_burst;
 	return 0;
 error:
 	rxq_cleanup(&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index bee5ce2..63ddc53 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -93,7 +93,7 @@ txq_complete(struct txq *txq)
 	DEBUG("%p: processing %u work requests completions",
 	      (void *)txq, elts_comp);
 #endif
-	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
+	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
 	if (unlikely(wcs_n == 0))
 		return 0;
 	if (unlikely(wcs_n < 0)) {
@@ -538,14 +538,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			/* Put packet into send queue. */
 #if MLX5_PMD_MAX_INLINE > 0
 			if (length <= txq->max_inline)
-				err = txq->if_qp->send_pending_inline
+				err = txq->send_pending_inline
 					(txq->qp,
 					 (void *)addr,
 					 length,
 					 send_flags);
 			else
 #endif
-				err = txq->if_qp->send_pending
+				err = txq->send_pending
 					(txq->qp,
 					 addr,
 					 length,
@@ -567,7 +567,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				goto stop;
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Put SG list into send queue. */
-			err = txq->if_qp->send_pending_sg_list
+			err = txq->send_pending_sg_list
 				(txq->qp,
 				 sges,
 				 ret.num,
@@ -599,7 +599,7 @@ stop:
 	txq->stats.opackets += i;
 #endif
 	/* Ring QP doorbell. */
-	err = txq->if_qp->send_flush(txq->qp);
+	err = txq->send_flush(txq->qp);
 	if (unlikely(err)) {
 		/* A nonzero value is not supposed to be returned.
 		 * Nothing can be done about it. */
@@ -733,14 +733,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Sanity checks. */
 		assert(elts_head < rxq->elts_n);
 		assert(rxq->elts_head < rxq->elts_n);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL,
-							  &flags, &vlan_tci);
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		(void)vlan_tci;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
 			int wcs_n;
@@ -877,9 +870,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		rxq->stats.ibytes += pkt_buf_len;
 #endif
 repost:
-		ret = rxq->if_wq->recv_sg_list(rxq->wq,
-					       elt->sges,
-					       RTE_DIM(elt->sges));
+		ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
 		if (unlikely(ret)) {
 			/* Inability to repost WRs is fatal. */
 			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
@@ -950,14 +941,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 */
 		rte_prefetch0(seg);
 		rte_prefetch0(&seg->cacheline1);
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL,
-							  &flags, &vlan_tci);
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
-						    &flags);
-		(void)vlan_tci;
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
 		if (unlikely(ret < 0)) {
 			struct ibv_wc wc;
 			int wcs_n;
@@ -1049,7 +1033,7 @@ repost:
 #ifdef DEBUG_RECV
 	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
 #endif
-	ret = rxq->if_wq->recv_burst(rxq->wq, sges, i);
+	ret = rxq->recv(rxq->wq, sges, i);
 	if (unlikely(ret)) {
 		/* Inability to repost WRs is fatal. */
 		DEBUG("%p: recv_burst(): failed (ret=%d)",
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 4a857d8..b239ebf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -107,12 +107,8 @@ struct rxq {
 	struct rte_mempool *mp; /* Memory Pool for allocations. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
-	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
-#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-	struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
-#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
-#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	int32_t (*poll)(); /* Verbs poll function. */
+	int32_t (*recv)(); /* Verbs receive function. */
 	unsigned int port_id; /* Port ID for incoming packets. */
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
@@ -130,6 +126,12 @@ struct rxq {
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
 	struct ibv_mr *mr; /* Memory Region (for mp). */
+	struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */
+#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
+	struct ibv_exp_cq_family_v1 *if_cq; /* CQ interface. */
+#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
 };
 
 /* Hash RX queue types. */
@@ -248,6 +250,15 @@ typedef uint8_t linear_t[16384];
 /* TX queue descriptor. */
 struct txq {
 	struct priv *priv; /* Back pointer to private data. */
+	int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
+	int (*send_pending)();
+#if MLX5_PMD_MAX_INLINE > 0
+	int (*send_pending_inline)();
+#endif
+#if MLX5_PMD_SGE_WR_N > 1
+	int (*send_pending_sg_list)();
+#endif
+	int (*send_flush)(struct ibv_qp *qp);
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
 	struct txq_elt (*elts)[]; /* TX elements. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 214a7c1..3364fca 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -187,6 +187,11 @@ txq_cleanup(struct txq *txq)
 
 	DEBUG("cleaning up %p", (void *)txq);
 	txq_free_elts(txq);
+	txq->poll_cnt = NULL;
+#if MLX5_PMD_MAX_INLINE > 0
+	txq->send_pending_inline = NULL;
+#endif
+	txq->send_flush = NULL;
 	if (txq->if_qp != NULL) {
 		assert(txq->priv != NULL);
 		assert(txq->priv->ctx != NULL);
@@ -414,6 +419,15 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
 	DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
 	txq_cleanup(txq);
 	*txq = tmpl;
+	txq->poll_cnt = txq->if_cq->poll_cnt;
+#if MLX5_PMD_MAX_INLINE > 0
+	txq->send_pending_inline = txq->if_qp->send_pending_inline;
+#endif
+#if MLX5_PMD_SGE_WR_N > 1
+	txq->send_pending_sg_list = txq->if_qp->send_pending_sg_list;
+#endif
+	txq->send_pending = txq->if_qp->send_pending;
+	txq->send_flush = txq->if_qp->send_flush;
 	DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
 	/* Pre-register known mempools. */
 	rte_mempool_walk(txq_mp2mr_iter, txq);
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 4/7] mlx5: process offload flags only when requested
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                       ` (2 preceding siblings ...)
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 3/7] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 5/7] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
                       ` (3 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Improve performance by processing offloads only when requested by the
application.
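
Note the gate uses a bitwise OR rather than "||": csum, csum_l2tun and
vlan_strip are single-bit fields, so OR-ing them folds three tests into
one branch. The shape of the check, as applied in the diff below:

	/* Skip all per-packet offload work unless at least one offload
	 * was enabled on the queue at configuration time. */
	if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
		/* translate completion flags into packet_type/ol_flags */
	}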

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 63ddc53..c84ec8c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -853,14 +853,16 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NB_SEGS(pkt_buf) = j;
 		PORT(pkt_buf) = rxq->port_id;
 		PKT_LEN(pkt_buf) = pkt_buf_len;
-		pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
-		pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
+			pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
+			pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-			pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
-			pkt_buf->vlan_tci = vlan_tci;
-		}
+			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
+				pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
+				pkt_buf->vlan_tci = vlan_tci;
+			}
 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
+		}
 
 		/* Return packet. */
 		*(pkts++) = pkt_buf;
@@ -1006,15 +1008,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NEXT(seg) = NULL;
 		PKT_LEN(seg) = len;
 		DATA_LEN(seg) = len;
-		seg->packet_type = rxq_cq_to_pkt_type(flags);
-		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
+			seg->packet_type = rxq_cq_to_pkt_type(flags);
+			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
 #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
-		if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-			seg->ol_flags |= PKT_RX_VLAN_PKT;
-			seg->vlan_tci = vlan_tci;
-		}
+			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
+				seg->ol_flags |= PKT_RX_VLAN_PKT;
+				seg->vlan_tci = vlan_tci;
+			}
 #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
-
+		}
 		/* Return packet. */
 		*(pkts++) = seg;
 		++pkts_ret;
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 5/7] mlx5: avoid lkey retrieval for inlined packets
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                       ` (3 preceding siblings ...)
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 4/7] mlx5: process offload flags only when requested Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 6/7] mlx5: free buffers immediately after completion Adrien Mazarguil
                       ` (2 subsequent siblings)
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

This improves performance: inlined packets are copied straight into the
send work queue entry, so the hardware never reads the original buffer
and no memory region lkey needs to be looked up for them.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c84ec8c..b82017e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -512,16 +512,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			/* Retrieve buffer information. */
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			length = DATA_LEN(buf);
-			/* Retrieve Memory Region key for this memory pool. */
-			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up TX element. */
-				elt->buf = NULL;
-				goto stop;
-			}
 			/* Update element. */
 			elt->buf = buf;
 			if (txq->priv->vf)
@@ -545,12 +535,25 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					 send_flags);
 			else
 #endif
+			{
+				/* Retrieve Memory Region key for this
+				 * memory pool. */
+				lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+				if (unlikely(lkey == (uint32_t)-1)) {
+					/* MR does not exist. */
+					DEBUG("%p: unable to get MP <-> MR"
+					      " association", (void *)txq);
+					/* Clean up TX element. */
+					elt->buf = NULL;
+					goto stop;
+				}
 				err = txq->send_pending
 					(txq->qp,
 					 addr,
 					 length,
 					 lkey,
 					 send_flags);
+			}
 			if (unlikely(err))
 				goto stop;
 #ifdef MLX5_PMD_SOFT_COUNTERS
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 6/7] mlx5: free buffers immediately after completion
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                       ` (4 preceding siblings ...)
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 5/7] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 7/7] mlx: use aligned memory to register regions Adrien Mazarguil
  2016-03-09 16:28     ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Bruce Richardson
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

This lowers the number of cache misses: buffers are now freed in one
tight loop right after their completions are polled, while the mbuf
headers are still warm in cache, instead of one at a time during
subsequent enqueues.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b82017e..622ac17 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -84,6 +84,7 @@ txq_complete(struct txq *txq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
+	unsigned int elts_free = txq->elts_tail;
 	const unsigned int elts_n = txq->elts_n;
 	int wcs_n;
 
@@ -110,6 +111,25 @@ txq_complete(struct txq *txq)
 	elts_tail += wcs_n * txq->elts_comp_cd_init;
 	if (elts_tail >= elts_n)
 		elts_tail -= elts_n;
+
+	while (elts_free != elts_tail) {
+		struct txq_elt *elt = &(*txq->elts)[elts_free];
+		unsigned int elts_free_next =
+			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
+		struct rte_mbuf *tmp = elt->buf;
+		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+		/* Faster than rte_pktmbuf_free(). */
+		do {
+			struct rte_mbuf *next = NEXT(tmp);
+
+			rte_pktmbuf_free_seg(tmp);
+			tmp = next;
+		} while (tmp != NULL);
+		elts_free = elts_free_next;
+	}
+
 	txq->elts_tail = elts_tail;
 	txq->elts_comp = elts_comp;
 	return 0;
@@ -464,7 +484,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		struct rte_mbuf *buf_next = pkts[i + 1];
 		unsigned int elts_head_next =
 			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
 		unsigned int segs = NB_SEGS(buf);
 #ifdef MLX5_PMD_SOFT_COUNTERS
@@ -472,18 +491,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 #endif
 		uint32_t send_flags = 0;
 
-		/* Clean up old buffer. */
-		if (likely(elt->buf != NULL)) {
-			struct rte_mbuf *tmp = elt->buf;
-
-			/* Faster than rte_pktmbuf_free(). */
-			do {
-				struct rte_mbuf *next = NEXT(tmp);
-
-				rte_pktmbuf_free_seg(tmp);
-				tmp = next;
-			} while (tmp != NULL);
-		}
 		if (i + 1 < max)
 			rte_prefetch0(buf_next);
 		/* Request TX completion. */
@@ -517,7 +524,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (txq->priv->vf)
 				rte_prefetch0((volatile void *)
 					      (uintptr_t)addr);
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Prefetch next buffer data. */
 			if (i + 1 < max) {
 				buf_next_addr =
@@ -568,7 +574,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					  &sges);
 			if (ret.length == (unsigned int)-1)
 				goto stop;
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 			/* Put SG list into send queue. */
 			err = txq->send_pending_sg_list
 				(txq->qp,
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [dpdk-dev] [PATCH v3 7/7] mlx: use aligned memory to register regions
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                       ` (5 preceding siblings ...)
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 6/7] mlx5: free buffers immediately after completion Adrien Mazarguil
@ 2016-03-03 14:27     ` Adrien Mazarguil
  2016-03-09 16:28     ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Bruce Richardson
  7 siblings, 0 replies; 24+ messages in thread
From: Adrien Mazarguil @ 2016-03-03 14:27 UTC (permalink / raw)
  To: dev

The first and last memory pool elements are usually cache-aligned but not
page-aligned, particularly when using huge pages.

Hardware performance can be improved significantly by registering memory
regions starting and ending on page boundaries.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4.c      | 58 +++++++++++++++++++++++++++++++++++++-------
 drivers/net/mlx5/mlx5_rxq.c  |  6 +----
 drivers/net/mlx5/mlx5_rxtx.c | 52 ++++++++++++++++++++++++++++++++++++---
 drivers/net/mlx5/mlx5_rxtx.h |  1 +
 4 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 6688f66..3c1f4c2 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -86,6 +86,7 @@
 #include <rte_version.h>
 #include <rte_log.h>
 #include <rte_alarm.h>
+#include <rte_memory.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -1177,6 +1178,52 @@ txq_complete(struct txq *txq)
 	return 0;
 }
 
+/* For best performance, this function should not be inlined. */
+static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, const struct rte_mempool *)
+	__attribute__((noinline));
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ *   Pointer to protection domain.
+ * @param mp
+ *   Pointer to memory pool.
+ *
+ * @return
+ *   Memory region pointer, NULL in case of error.
+ */
+static struct ibv_mr *
+mlx4_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start = mp->elt_va_start;
+	uintptr_t end = mp->elt_va_end;
+	unsigned int i;
+
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		unsigned int align = ms[i].hugepage_sz;
+
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	return ibv_reg_mr(pd,
+			  (void *)start,
+			  end - start,
+			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+}
+
 /**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
  * the cloned mbuf is allocated is returned instead.
@@ -1228,10 +1275,7 @@ txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (const void *)mp);
-	mr = ibv_reg_mr(txq->priv->pd,
-			(void *)mp->elt_va_start,
-			(mp->elt_va_end - mp->elt_va_start),
-			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq);
@@ -3713,11 +3757,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	DEBUG("%p: %s scattered packets support (%u WRs)",
 	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
 	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = ibv_reg_mr(priv->pd,
-			     (void *)mp->elt_va_start,
-			     (mp->elt_va_end - mp->elt_va_start),
-			     (IBV_ACCESS_LOCAL_WRITE |
-			      IBV_ACCESS_REMOTE_WRITE));
+	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
 		ret = EINVAL;
 		ERROR("%p: MR creation failure: %s",
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 55d002e..0f5ac65 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1190,11 +1190,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	DEBUG("%p: %s scattered packets support (%u WRs)",
 	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
 	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = ibv_reg_mr(priv->pd,
-			     (void *)mp->elt_va_start,
-			     (mp->elt_va_end - mp->elt_va_start),
-			     (IBV_ACCESS_LOCAL_WRITE |
-			      IBV_ACCESS_REMOTE_WRITE));
+	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
 		ret = EINVAL;
 		ERROR("%p: MR creation failure: %s",
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 622ac17..4c53c7a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -55,6 +55,7 @@
 #include <rte_prefetch.h>
 #include <rte_common.h>
 #include <rte_branch_prediction.h>
+#include <rte_memory.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -135,6 +136,52 @@ txq_complete(struct txq *txq)
 	return 0;
 }
 
+/* For best performance, this function should not be inlined. */
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *)
+	__attribute__((noinline));
+
+/**
+ * Register mempool as a memory region.
+ *
+ * @param pd
+ *   Pointer to protection domain.
+ * @param mp
+ *   Pointer to memory pool.
+ *
+ * @return
+ *   Memory region pointer, NULL in case of error.
+ */
+struct ibv_mr *
+mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start = mp->elt_va_start;
+	uintptr_t end = mp->elt_va_end;
+	unsigned int i;
+
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		unsigned int align = ms[i].hugepage_sz;
+
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (const void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	return ibv_reg_mr(pd,
+			  (void *)start,
+			  end - start,
+			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+}
+
 /**
  * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
  * the cloned mbuf is allocated is returned instead.
@@ -186,10 +233,7 @@ txq_mp2mr(struct txq *txq, const struct rte_mempool *mp)
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (const void *)mp);
-	mr = ibv_reg_mr(txq->priv->pd,
-			(void *)mp->elt_va_start,
-			(mp->elt_va_end - mp->elt_va_start),
-			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+	mr = mlx5_mp2mr(txq->priv->pd, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq);
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index b239ebf..e85cf93 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -317,6 +317,7 @@ void mlx5_tx_queue_release(void *);
 
 /* mlx5_rxtx.c */
 
+struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, const struct rte_mempool *);
 void txq_mp2mr_iter(const struct rte_mempool *, void *);
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
-- 
2.1.4

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4
  2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
                       ` (6 preceding siblings ...)
  2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 7/7] mlx: use aligned memory to register regions Adrien Mazarguil
@ 2016-03-09 16:28     ` Bruce Richardson
  7 siblings, 0 replies; 24+ messages in thread
From: Bruce Richardson @ 2016-03-09 16:28 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

On Thu, Mar 03, 2016 at 03:27:10PM +0100, Adrien Mazarguil wrote:
> This patchset improves the mlx5 PMD performance by doing better prefetching,
> by reordering internal structure fields and by removing a few unnecessary
> operations.
> 
> Note: should be applied after "Add flow director and RX VLAN stripping
> support" to avoid conflicts.
> 
> Changes in v3:
> - None, submitted again due to dependency with previous patchset.
> 
> Changes in v2:
> - Rebased patchset on top of dpdk-next-net/rel_16_04.
> - Fixed missing update for receive function in rxq_rehash().
> - Added a commit to register memory on page boundaries instead of mempool
>   object boundaries for better performance (mlx4 and mlx5).
> 
> Adrien Mazarguil (1):
>   mlx: use aligned memory to register regions
> 
> Nelio Laranjeiro (6):
>   mlx5: prefetch next TX mbuf header and data
>   mlx5: reorder TX/RX queue structure
>   mlx5: remove one indirection level from RX/TX functions
>   mlx5: process offload flags only when requested
>   mlx5: avoid lkey retrieval for inlined packets
>   mlx5: free buffers immediately after completion
>
Applied to dpdk-next-net/rel_16_04

/Bruce

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread

Thread overview: 24+ messages
2016-01-29 10:32 [dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5 Adrien Mazarguil
2016-01-29 10:32 ` [dpdk-dev] [PATCH 1/6] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
2016-01-29 10:32 ` [dpdk-dev] [PATCH 2/6] mlx5: reorder TX/RX queue structure Adrien Mazarguil
2016-01-29 10:32 ` [dpdk-dev] [PATCH 3/6] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
2016-01-29 10:32 ` [dpdk-dev] [PATCH 4/6] mlx5: process offload flags only when requested Adrien Mazarguil
2016-01-29 10:32 ` [dpdk-dev] [PATCH 5/6] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
2016-01-29 10:32 ` [dpdk-dev] [PATCH 6/6] mlx5: free buffers immediately after completion Adrien Mazarguil
2016-02-22 18:17 ` [dpdk-dev] [PATCH v2 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 2/7] mlx5: reorder TX/RX queue structure Adrien Mazarguil
2016-02-22 18:17   ` [dpdk-dev] [PATCH v2 3/7] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 4/7] mlx5: process offload flags only when requested Adrien Mazarguil
2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 5/7] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 6/7] mlx5: free buffers immediately after completion Adrien Mazarguil
2016-02-22 18:18   ` [dpdk-dev] [PATCH v2 7/7] mlx: use aligned memory to register regions Adrien Mazarguil
2016-03-03 14:27   ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 1/7] mlx5: prefetch next TX mbuf header and data Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 2/7] mlx5: reorder TX/RX queue structure Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 3/7] mlx5: remove one indirection level from RX/TX functions Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 4/7] mlx5: process offload flags only when requested Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 5/7] mlx5: avoid lkey retrieval for inlined packets Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 6/7] mlx5: free buffers immediately after completion Adrien Mazarguil
2016-03-03 14:27     ` [dpdk-dev] [PATCH v3 7/7] mlx: use aligned memory to register regions Adrien Mazarguil
2016-03-09 16:28     ` [dpdk-dev] [PATCH v3 0/7] Performance optimizations for mlx5 and mlx4 Bruce Richardson
