DPDK patches and discussions
From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 10/25] mlx5: add definitions for data path without Verbs
Date: Thu, 23 Jun 2016 18:53:06 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

These structures and macros extend those exposed by libmlx5 (in mlx5_hw.h)
to let the PMD manage work queue and completion queue elements directly.

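For reference, a minimal illustrative sketch (not part of this patch) of how
the owner/opcode macros above are meant to be combined when polling a CQ ring
directly; "cqes", "cqe_cnt" (a power of two) and "cq_ci" are placeholder
names, the real helpers arrive with the Rx/Tx data path patches later in this
series:

static inline volatile struct mlx5_cqe64 *
cq_poll_one(volatile struct mlx5_cqe (*cqes)[], unsigned int cqe_cnt,
            uint16_t *cq_ci)
{
        volatile struct mlx5_cqe64 *cqe =
                &(*cqes)[*cq_ci & (cqe_cnt - 1)].cqe64;
        uint8_t op_own = cqe->op_own;

        /* Hardware toggles the owner bit on each wrap of the ring. */
        if (MLX5_CQE_OWNER(op_own) == !(*cq_ci & cqe_cnt))
                return NULL; /* CQE not yet written by hardware. */
        if (MLX5_CQE_OPCODE(op_own) == MLX5_CQE_INVALID)
                return NULL; /* No completion available. */
        ++(*cq_ci);
        return cqe;
}
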
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_prm.h | 163 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 drivers/net/mlx5/mlx5_prm.h

diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
new file mode 100644
index 0000000..5db219b
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -0,0 +1,163 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2016 6WIND S.A.
+ *   Copyright 2016 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_PRM_H_
+#define RTE_PMD_MLX5_PRM_H_
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/mlx5_hw.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* Get CQE owner bit. */
+#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK)
+
+/* Get CQE format. */
+#define MLX5_CQE_FORMAT(op_own) (((op_own) & MLX5E_CQE_FORMAT_MASK) >> 2)
+
+/* Get CQE opcode. */
+#define MLX5_CQE_OPCODE(op_own) (((op_own) & 0xf0) >> 4)
+
+/* Get CQE solicited event. */
+#define MLX5_CQE_SE(op_own) (((op_own) >> 1) & 1)
+
+/* Invalidate a CQE. */
+#define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
+
+/* CQE value to inform that VLAN is stripped. */
+#define MLX5_CQE_VLAN_STRIPPED 0x1
+
+/* Maximum number of packets a multi-packet WQE can handle. */
+#define MLX5_MPW_DSEG_MAX 5
+
+/* Room for inline data in regular work queue element. */
+#define MLX5_WQE64_INL_DATA 12
+
+/* Room for inline data in multi-packet WQE. */
+#define MLX5_MWQE64_INL_DATA 28
+
+/* Subset of struct mlx5_wqe_eth_seg. */
+struct mlx5_wqe_eth_seg_small {
+	uint32_t rsvd0;
+	uint8_t	cs_flags;
+	uint8_t	rsvd1;
+	uint16_t mss;
+	uint32_t rsvd2;
+	uint16_t inline_hdr_sz;
+};
+
+/* Regular WQE. */
+struct mlx5_wqe_regular {
+	union {
+		struct mlx5_wqe_ctrl_seg ctrl;
+		uint32_t data[4];
+	} ctrl;
+	struct mlx5_wqe_eth_seg eseg;
+	struct mlx5_wqe_data_seg dseg;
+} __rte_aligned(64);
+
+/* Inline WQE. */
+struct mlx5_wqe_inl {
+	union {
+		struct mlx5_wqe_ctrl_seg ctrl;
+		uint32_t data[4];
+	} ctrl;
+	struct mlx5_wqe_eth_seg eseg;
+	uint32_t byte_cnt;
+	uint8_t data[MLX5_WQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Multi-packet WQE. */
+struct mlx5_wqe_mpw {
+	union {
+		struct mlx5_wqe_ctrl_seg ctrl;
+		uint32_t data[4];
+	} ctrl;
+	struct mlx5_wqe_eth_seg_small eseg;
+	struct mlx5_wqe_data_seg dseg[2];
+} __rte_aligned(64);
+
+/* Multi-packet WQE with inline. */
+struct mlx5_wqe_mpw_inl {
+	union {
+		struct mlx5_wqe_ctrl_seg ctrl;
+		uint32_t data[4];
+	} ctrl;
+	struct mlx5_wqe_eth_seg_small eseg;
+	uint32_t byte_cnt;
+	uint8_t data[MLX5_MWQE64_INL_DATA];
+} __rte_aligned(64);
+
+/* Union of all WQE types. */
+union mlx5_wqe {
+	struct mlx5_wqe_regular wqe;
+	struct mlx5_wqe_inl inl;
+	struct mlx5_wqe_mpw mpw;
+	struct mlx5_wqe_mpw_inl mpw_inl;
+	uint8_t data[64];
+};
+
+/* MPW session status. */
+enum mlx5_mpw_state {
+	MLX5_MPW_STATE_OPENED,
+	MLX5_MPW_INL_STATE_OPENED,
+	MLX5_MPW_STATE_CLOSED,
+};
+
+/* MPW session descriptor. */
+struct mlx5_mpw {
+	enum mlx5_mpw_state state;
+	unsigned int pkts_n;
+	unsigned int len;
+	unsigned int total_len;
+	volatile union mlx5_wqe *wqe;
+	union {
+		volatile struct mlx5_wqe_data_seg *dseg[MLX5_MPW_DSEG_MAX];
+		volatile uint8_t *raw;
+	} data;
+};
+
+/* CQ element structure - should be equal to the cache line size */
+struct mlx5_cqe {
+#if (RTE_CACHE_LINE_SIZE == 128)
+	uint8_t padding[64];
+#endif
+	struct mlx5_cqe64 cqe64;
+};
+
+#endif /* RTE_PMD_MLX5_PRM_H_ */
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 11/25] mlx5: add support for configuration through kvargs
Date: Thu, 23 Jun 2016 19:05:09 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

The intent is to replace the remaining compile-time options and environment
variables with a common means of runtime configuration. This commit only
adds the kvargs handling code; subsequent commits will update the rest.

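As a hedged illustration of where this is heading (the "txq_inline" key and
the matching priv field below are purely hypothetical at this point in the
series), a follow-up commit would add its key to the params[] list and branch
on it in mlx5_args_check():

/* Hypothetical follow-up, not part of this commit. */
static const char *params[] = {
        "txq_inline",   /* hypothetical key */
        NULL,
};

static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
        struct priv *priv = opaque;
        char *end = NULL;
        unsigned long tmp = strtoul(val, &end, 0);

        if ((end == val) || (*end != '\0')) {
                WARN("%s: \"%s\" is not a valid integer", key, val);
                return -EINVAL;
        }
        if (strcmp(key, "txq_inline") == 0) {
                priv->txq_inline = tmp; /* hypothetical field */
                return 0;
        }
        WARN("%s: unknown parameter", key);
        return -EINVAL;
}
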
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3f45d84..6027393 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -37,6 +37,7 @@
 #include <assert.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <errno.h>
 #include <net/if.h>
 
 /* Verbs header. */
@@ -57,6 +58,7 @@
 #include <rte_ethdev.h>
 #include <rte_pci.h>
 #include <rte_common.h>
+#include <rte_kvargs.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -237,6 +239,71 @@ mlx5_dev_idx(struct rte_pci_addr *pci_addr)
 	return ret;
 }
 
+/**
+ * Verify and store value for device argument.
+ *
+ * @param[in] key
+ *   Key argument to verify.
+ * @param[in] val
+ *   Value associated with key.
+ * @param opaque
+ *   User data.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+static int
+mlx5_args_check(const char *key, const char *val, void *opaque)
+{
+	struct priv *priv = opaque;
+
+	/* No parameters are expected at the moment. */
+	(void)priv;
+	(void)val;
+	WARN("%s: unknown parameter", key);
+	return -EINVAL;
+}
+
+/**
+ * Parse device parameters.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param devargs
+ *   Device arguments structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+mlx5_args(struct priv *priv, struct rte_devargs *devargs)
+{
+	const char **params = (const char *[]){
+		NULL,
+	};
+	struct rte_kvargs *kvlist;
+	int ret = 0;
+	int i;
+
+	if (devargs == NULL)
+		return 0;
+	/* Following UGLY cast is done to pass checkpatch. */
+	kvlist = rte_kvargs_parse(devargs->args, params);
+	if (kvlist == NULL)
+		return 0;
+	/* Process parameters. */
+	for (i = 0; (i != RTE_DIM(params)); ++i) {
+		if (rte_kvargs_count(kvlist, params[i])) {
+			ret = rte_kvargs_process(kvlist, params[i],
+						 mlx5_args_check, priv);
+			if (ret != 0)
+				return ret;
+		}
+	}
+	rte_kvargs_free(kvlist);
+	return 0;
+}
+
 static struct eth_driver mlx5_driver;
 
 /**
@@ -408,6 +475,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
+		err = mlx5_args(priv, pci_dev->devargs);
+		if (err) {
+			ERROR("failed to process device arguments: %s",
+			      strerror(err));
+			goto port_error;
+		}
 		if (ibv_exp_query_device(ctx, &exp_device_attr)) {
 			ERROR("ibv_exp_query_device() failed");
 			goto port_error;
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 12/25] mlx5: add Tx/Rx burst function selection wrapper
Date: Thu, 23 Jun 2016 19:05:10 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

These wrappers are meant to prevent code duplication later.

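For illustration only, once alternative burst implementations exist this
wrapper becomes the single decision point; a sketch with hypothetical names
("mlx5_tx_burst_inline" and the "txq_inline" field are not added by this
commit):

void
priv_select_tx_function(struct priv *priv)
{
        /* Hypothetical sketch, not part of this commit. */
        if (priv->txq_inline)
                priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
        else
                priv->dev->tx_pkt_burst = mlx5_tx_burst;
}
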
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.h        |  2 ++
 drivers/net/mlx5/mlx5_ethdev.c | 34 ++++++++++++++++++++++++++++------
 drivers/net/mlx5/mlx5_txq.c    |  2 +-
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 935e1b0..3dca03d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -196,6 +196,8 @@ void priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);
 int mlx5_set_link_down(struct rte_eth_dev *dev);
 int mlx5_set_link_up(struct rte_eth_dev *dev);
 struct priv *mlx5_secondary_data_setup(struct priv *priv);
+void priv_select_tx_function(struct priv *);
+void priv_select_rx_function(struct priv *);
 
 /* mlx5_mac.c */
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 4095a06..759434e 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1099,8 +1099,8 @@ priv_set_link(struct priv *priv, int up)
 		err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
 		if (err)
 			return err;
-		dev->rx_pkt_burst = mlx5_rx_burst;
-		dev->tx_pkt_burst = mlx5_tx_burst;
+		priv_select_tx_function(priv);
+		priv_select_rx_function(priv);
 	} else {
 		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
 		if (err)
@@ -1290,13 +1290,11 @@ mlx5_secondary_data_setup(struct priv *priv)
 	rte_mb();
 	priv->dev->data = &sd->data;
 	rte_mb();
-	priv->dev->tx_pkt_burst = mlx5_tx_burst;
-	priv->dev->rx_pkt_burst = removed_rx_burst;
+	priv_select_tx_function(priv);
+	priv_select_rx_function(priv);
 	priv_unlock(priv);
 end:
 	/* More sanity checks. */
-	assert(priv->dev->tx_pkt_burst == mlx5_tx_burst);
-	assert(priv->dev->rx_pkt_burst == removed_rx_burst);
 	assert(priv->dev->data == &sd->data);
 	rte_spinlock_unlock(&sd->lock);
 	return priv;
@@ -1307,3 +1305,27 @@ error:
 	rte_spinlock_unlock(&sd->lock);
 	return NULL;
 }
+
+/**
+ * Configure the TX function to use.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+priv_select_tx_function(struct priv *priv)
+{
+	priv->dev->tx_pkt_burst = mlx5_tx_burst;
+}
+
+/**
+ * Configure the RX function to use.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ */
+void
+priv_select_rx_function(struct priv *priv)
+{
+	priv->dev->rx_pkt_burst = mlx5_rx_burst;
+}
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 51210f2..ec4488a 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -478,7 +478,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		      (void *)dev, (void *)txq_ctrl);
 		(*priv->txqs)[idx] = &txq_ctrl->txq;
 		/* Update send callback. */
-		dev->tx_pkt_burst = mlx5_tx_burst;
+		priv_select_tx_function(priv);
 	}
 	priv_unlock(priv);
 	return -ret;
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Yaacov Hazan <yaacovh@mellanox.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>,
	Vasily Philipov <vasilyf@mellanox.com>
Subject: [dpdk-dev] [PATCH v5 13/25] mlx5: refactor Rx data path
Date: Thu, 23 Jun 2016 19:05:11 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

Bypass Verbs to improve RX performance.

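In short, the burst function now reads CQEs and rewrites receive WQEs itself
instead of going through ibv_poll_cq()/recv_burst(). A condensed sketch of
the two key steps, using the rxq fields introduced below (error handling
omitted, see mlx5_rx_burst() in the diff for the real code):

/* Replace a consumed buffer in place: only the address changes, the
 * lkey and byte count written at setup time remain valid. */
static inline void
rxq_replace_buf(struct rxq *rxq, unsigned int idx, struct rte_mbuf *rep)
{
        volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];

        wqe->addr = htonll((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM);
        (*rxq->elts)[idx] = rep;
}

/* Publish the new consumer indexes once per burst; the barriers keep
 * the descriptor rewrites and the two doorbell writes ordered. */
static inline void
rxq_ring_doorbells(struct rxq *rxq)
{
        rte_wmb();
        *rxq->cq_db = htonl(rxq->cq_ci);
        rte_wmb();
        *rxq->rq_db = htonl(rxq->rq_ci);
}
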
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Yaacov Hazan <yaacovh@mellanox.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx5/mlx5_ethdev.c |   4 +-
 drivers/net/mlx5/mlx5_fdir.c   |   2 +-
 drivers/net/mlx5/mlx5_rxq.c    | 303 ++++++++++++++++++++---------------------
 drivers/net/mlx5/mlx5_rxtx.c   | 289 ++++++++++++++++++++-------------------
 drivers/net/mlx5/mlx5_rxtx.h   |  38 +++---
 drivers/net/mlx5/mlx5_vlan.c   |   3 +-
 6 files changed, 326 insertions(+), 313 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 759434e..16b05d3 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1263,7 +1263,9 @@ mlx5_secondary_data_setup(struct priv *priv)
 	}
 	/* RX queues. */
 	for (i = 0; i != nb_rx_queues; ++i) {
-		struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];
+		struct rxq_ctrl *primary_rxq =
+			container_of((*sd->primary_priv->rxqs)[i],
+				     struct rxq_ctrl, rxq);
 
 		if (primary_rxq == NULL)
 			continue;
diff --git a/drivers/net/mlx5/mlx5_fdir.c b/drivers/net/mlx5/mlx5_fdir.c
index 1850218..73eb00e 100644
--- a/drivers/net/mlx5/mlx5_fdir.c
+++ b/drivers/net/mlx5/mlx5_fdir.c
@@ -431,7 +431,7 @@ priv_get_fdir_queue(struct priv *priv, uint16_t idx)
 	ind_init_attr = (struct ibv_exp_rwq_ind_table_init_attr){
 		.pd = priv->pd,
 		.log_ind_tbl_size = 0,
-		.ind_tbl = &((*priv->rxqs)[idx]->wq),
+		.ind_tbl = &rxq_ctrl->wq,
 		.comp_mask = 0,
 	};
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b474a18..b1d6cfe 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -43,6 +43,8 @@
 #pragma GCC diagnostic ignored "-pedantic"
 #endif
 #include <infiniband/verbs.h>
+#include <infiniband/arch.h>
+#include <infiniband/mlx5_hw.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -373,8 +375,13 @@ priv_create_hash_rxqs(struct priv *priv)
 		DEBUG("indirection table extended to assume %u WQs",
 		      priv->reta_idx_n);
 	}
-	for (i = 0; (i != priv->reta_idx_n); ++i)
-		wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
+	for (i = 0; (i != priv->reta_idx_n); ++i) {
+		struct rxq_ctrl *rxq_ctrl;
+
+		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
+					struct rxq_ctrl, rxq);
+		wqs[i] = rxq_ctrl->wq;
+	}
 	/* Get number of hash RX queues to configure. */
 	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
 		hash_rxqs_n += ind_table_init[i].hash_types_n;
@@ -638,21 +645,13 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 	       struct rte_mbuf **pool)
 {
 	unsigned int i;
-	struct rxq_elt (*elts)[elts_n] =
-		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-				  rxq_ctrl->socket);
 	int ret = 0;
 
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
-		ret = ENOMEM;
-		goto error;
-	}
 	/* For each WR (packet). */
 	for (i = 0; (i != elts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct ibv_sge *sge = &(*elts)[i].sge;
 		struct rte_mbuf *buf;
+		volatile struct mlx5_wqe_data_seg *scat =
+			&(*rxq_ctrl->rxq.wqes)[i];
 
 		if (pool != NULL) {
 			buf = *(pool++);
@@ -666,40 +665,36 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 			ret = ENOMEM;
 			goto error;
 		}
-		elt->buf = buf;
 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
 		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
 		/* Buffer is supposed to be empty. */
 		assert(rte_pktmbuf_data_len(buf) == 0);
 		assert(rte_pktmbuf_pkt_len(buf) == 0);
-		/* sge->addr must be able to store a pointer. */
-		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-		/* SGE keeps its headroom. */
-		sge->addr = (uintptr_t)
-			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-		sge->lkey = rxq_ctrl->mr->lkey;
-		/* Redundant check for tailroom. */
-		assert(sge->length == rte_pktmbuf_tailroom(buf));
+		assert(!buf->next);
+		PORT(buf) = rxq_ctrl->rxq.port_id;
+		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
+		PKT_LEN(buf) = DATA_LEN(buf);
+		NB_SEGS(buf) = 1;
+		/* scat->addr must be able to store a pointer. */
+		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+		*scat = (struct mlx5_wqe_data_seg){
+			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = htonl(DATA_LEN(buf)),
+			.lkey = htonl(rxq_ctrl->mr->lkey),
+		};
+		(*rxq_ctrl->rxq.elts)[i] = buf;
 	}
 	DEBUG("%p: allocated and configured %u single-segment WRs",
 	      (void *)rxq_ctrl, elts_n);
-	rxq_ctrl->rxq.elts_n = elts_n;
-	rxq_ctrl->rxq.elts_head = 0;
-	rxq_ctrl->rxq.elts = elts;
 	assert(ret == 0);
 	return 0;
 error:
-	if (elts != NULL) {
-		assert(pool == NULL);
-		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-			struct rxq_elt *elt = &(*elts)[i];
-			struct rte_mbuf *buf = elt->buf;
-
-			if (buf != NULL)
-				rte_pktmbuf_free_seg(buf);
-		}
-		rte_free(elts);
+	assert(pool == NULL);
+	elts_n = i;
+	for (i = 0; (i != elts_n); ++i) {
+		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+		(*rxq_ctrl->rxq.elts)[i] = NULL;
 	}
 	DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
 	assert(ret > 0);
@@ -716,22 +711,16 @@ static void
 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
 {
 	unsigned int i;
-	unsigned int elts_n = rxq_ctrl->rxq.elts_n;
-	struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;
 
 	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
-	rxq_ctrl->rxq.elts_n = 0;
-	rxq_ctrl->rxq.elts = NULL;
-	if (elts == NULL)
+	if (rxq_ctrl->rxq.elts == NULL)
 		return;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct rte_mbuf *buf = elt->buf;
 
-		if (buf != NULL)
-			rte_pktmbuf_free_seg(buf);
+	for (i = 0; (i != rxq_ctrl->rxq.elts_n); ++i) {
+		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
+		(*rxq_ctrl->rxq.elts)[i] = NULL;
 	}
-	rte_free(elts);
 }
 
 /**
@@ -749,42 +738,40 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 
 	DEBUG("cleaning up %p", (void *)rxq_ctrl);
 	rxq_free_elts(rxq_ctrl);
-	rxq_ctrl->rxq.poll = NULL;
-	rxq_ctrl->rxq.recv = NULL;
 	if (rxq_ctrl->if_wq != NULL) {
-		assert(rxq_ctrl->rxq.priv != NULL);
-		assert(rxq_ctrl->rxq.priv->ctx != NULL);
-		assert(rxq_ctrl->rxq.wq != NULL);
+		assert(rxq_ctrl->priv != NULL);
+		assert(rxq_ctrl->priv->ctx != NULL);
+		assert(rxq_ctrl->wq != NULL);
 		params = (struct ibv_exp_release_intf_params){
 			.comp_mask = 0,
 		};
-		claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+		claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
 						rxq_ctrl->if_wq,
 						&params));
 	}
 	if (rxq_ctrl->if_cq != NULL) {
-		assert(rxq_ctrl->rxq.priv != NULL);
-		assert(rxq_ctrl->rxq.priv->ctx != NULL);
-		assert(rxq_ctrl->rxq.cq != NULL);
+		assert(rxq_ctrl->priv != NULL);
+		assert(rxq_ctrl->priv->ctx != NULL);
+		assert(rxq_ctrl->cq != NULL);
 		params = (struct ibv_exp_release_intf_params){
 			.comp_mask = 0,
 		};
-		claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
+		claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
 						rxq_ctrl->if_cq,
 						&params));
 	}
-	if (rxq_ctrl->rxq.wq != NULL)
-		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
-	if (rxq_ctrl->rxq.cq != NULL)
-		claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
+	if (rxq_ctrl->wq != NULL)
+		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
+	if (rxq_ctrl->cq != NULL)
+		claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
 	if (rxq_ctrl->rd != NULL) {
 		struct ibv_exp_destroy_res_domain_attr attr = {
 			.comp_mask = 0,
 		};
 
-		assert(rxq_ctrl->rxq.priv != NULL);
-		assert(rxq_ctrl->rxq.priv->ctx != NULL);
-		claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
+		assert(rxq_ctrl->priv != NULL);
+		assert(rxq_ctrl->priv->ctx != NULL);
+		claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
 						      rxq_ctrl->rd,
 						      &attr));
 	}
@@ -811,14 +798,13 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 int
 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 {
-	struct priv *priv = rxq_ctrl->rxq.priv;
+	struct priv *priv = rxq_ctrl->priv;
 	struct rxq_ctrl tmpl = *rxq_ctrl;
 	unsigned int mbuf_n;
 	unsigned int desc_n;
 	struct rte_mbuf **pool;
 	unsigned int i, k;
 	struct ibv_exp_wq_attr mod;
-	struct rxq_elt (*elts)[tmpl.rxq.elts_n];
 	int err;
 
 	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
@@ -841,7 +827,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
 		.wq_state = IBV_EXP_WQS_RESET,
 	};
-	err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+	err = ibv_exp_modify_wq(tmpl.wq, &mod);
 	if (err) {
 		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
 		assert(err > 0);
@@ -855,60 +841,33 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 	}
 	/* Snatch mbufs from original queue. */
 	k = 0;
-	elts = rxq_ctrl->rxq.elts;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct rte_mbuf *buf = elt->buf;
-
-		pool[k++] = buf;
-	}
+	for (i = 0; (i != desc_n); ++i)
+		pool[k++] = (*rxq_ctrl->rxq.elts)[i];
 	assert(k == mbuf_n);
-	tmpl.rxq.elts_n = 0;
-	tmpl.rxq.elts = NULL;
-	assert((void *)&tmpl.rxq.elts == NULL);
-	err = rxq_alloc_elts(&tmpl, desc_n, pool);
-	if (err) {
-		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
-		rte_free(pool);
-		assert(err > 0);
-		return err;
-	}
-	assert(tmpl.rxq.elts_n == desc_n);
 	rte_free(pool);
-	/* Clean up original data. */
-	rxq_ctrl->rxq.elts_n = 0;
-	rte_free(rxq_ctrl->rxq.elts);
-	rxq_ctrl->rxq.elts = NULL;
 	/* Change queue state to ready. */
 	mod = (struct ibv_exp_wq_attr){
 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
 		.wq_state = IBV_EXP_WQS_RDY,
 	};
-	err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+	err = ibv_exp_modify_wq(tmpl.wq, &mod);
 	if (err) {
 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
 		      (void *)dev, strerror(err));
 		goto error;
 	}
 	/* Post SGEs. */
-	assert(tmpl.if_wq != NULL);
-	elts = tmpl.rxq.elts;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-		err = tmpl.if_wq->recv_burst(
-			tmpl.rxq.wq,
-			&(*elts)[i].sge,
-			1);
-		if (err)
-			break;
-	}
+	err = rxq_alloc_elts(&tmpl, desc_n, pool);
 	if (err) {
-		ERROR("%p: failed to post SGEs with error %d",
-		      (void *)dev, err);
-		/* Set err because it does not contain a valid errno value. */
-		err = EIO;
-		goto error;
+		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
+		rte_free(pool);
+		assert(err > 0);
+		return err;
 	}
-	tmpl.rxq.recv = tmpl.if_wq->recv_burst;
+	/* Update doorbell counter. */
+	rxq_ctrl->rxq.rq_ci = desc_n;
+	rte_wmb();
+	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
 error:
 	*rxq_ctrl = tmpl;
 	assert(err >= 0);
@@ -916,6 +875,45 @@ error:
 }
 
 /**
+ * Initialize RX queue.
+ *
+ * @param tmpl
+ *   Pointer to RX queue control template.
+ * @param rxq_ctrl
+ *   Pointer to RX queue control.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static inline int
+rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+{
+	struct ibv_cq *ibcq = tmpl->cq;
+	struct mlx5_cq *cq = to_mxxx(cq, cq);
+	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+
+	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
+		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
+		return EINVAL;
+	}
+	tmpl->rxq.rq_db = rwq->rq.db;
+	tmpl->rxq.cq_ci = 0;
+	tmpl->rxq.rq_ci = 0;
+	tmpl->rxq.cq_db = cq->dbrec;
+	tmpl->rxq.wqes =
+		(volatile struct mlx5_wqe_data_seg (*)[])
+		(uintptr_t)rwq->rq.buff;
+	tmpl->rxq.cqes =
+		(volatile struct mlx5_cqe (*)[])
+		(uintptr_t)cq->active_buf->buf;
+	tmpl->rxq.elts =
+		(struct rte_mbuf *(*)[tmpl->rxq.elts_n])
+		((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+	return 0;
+}
+
+/**
  * Configure a RX queue.
  *
  * @param dev
@@ -935,15 +933,16 @@ error:
  *   0 on success, errno value on failure.
  */
 int
-rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
-	  unsigned int socket, const struct rte_eth_rxconf *conf,
-	  struct rte_mempool *mp)
+rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
+	       uint16_t desc, unsigned int socket,
+	       const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct rxq_ctrl tmpl = {
+		.priv = priv,
 		.socket = socket,
 		.rxq = {
-			.priv = priv,
+			.elts_n = desc,
 			.mp = mp,
 		},
 	};
@@ -953,17 +952,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 		struct ibv_exp_cq_init_attr cq;
 		struct ibv_exp_res_domain_init_attr rd;
 		struct ibv_exp_wq_init_attr wq;
+		struct ibv_exp_cq_attr cq_attr;
 	} attr;
 	enum ibv_exp_query_intf_status status;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-	struct rxq_elt (*elts)[desc];
 	int ret = 0;
-	unsigned int i;
-	unsigned int cq_size = desc;
 
 	(void)conf; /* Thresholds configuration (ignored). */
 	if (desc == 0) {
-		ERROR("%p: invalid number of RX descriptors", (void *)dev);
+		ERROR("%p: invalid number of RX descriptors (must be a"
+		      " multiple of 2)", (void *)dev);
 		return EINVAL;
 	}
 	/* Toggle RX checksum offload if hardware supports it. */
@@ -998,9 +996,9 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
 		.res_domain = tmpl.rd,
 	};
-	tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
-					&attr.cq);
-	if (tmpl.rxq.cq == NULL) {
+	tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+				    &attr.cq);
+	if (tmpl.cq == NULL) {
 		ret = ENOMEM;
 		ERROR("%p: CQ creation failure: %s",
 		      (void *)dev, strerror(ret));
@@ -1017,13 +1015,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 		.wq_context = NULL, /* Could be useful in the future. */
 		.wq_type = IBV_EXP_WQT_RQ,
 		/* Max number of outstanding WRs. */
-		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
+		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
 				priv->device_attr.max_qp_wr :
-				(int)cq_size),
+				(int)desc),
 		/* Max number of scatter/gather elements in a WR. */
 		.max_recv_sge = 1,
 		.pd = priv->pd,
-		.cq = tmpl.rxq.cq,
+		.cq = tmpl.cq,
 		.comp_mask =
 			IBV_EXP_CREATE_WQ_RES_DOMAIN |
 			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
@@ -1066,19 +1064,13 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 		     " up to date",
 		     (void *)dev);
 
-	tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
-	if (tmpl.rxq.wq == NULL) {
+	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+	if (tmpl.wq == NULL) {
 		ret = (errno ? errno : EINVAL);
 		ERROR("%p: WQ creation failure: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
-	ret = rxq_alloc_elts(&tmpl, desc, NULL);
-	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
 	/* Save port ID. */
 	tmpl.rxq.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1086,7 +1078,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 		.intf_scope = IBV_EXP_INTF_GLOBAL,
 		.intf_version = 1,
 		.intf = IBV_EXP_INTF_CQ,
-		.obj = tmpl.rxq.cq,
+		.obj = tmpl.cq,
 	};
 	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
 	if (tmpl.if_cq == NULL) {
@@ -1097,7 +1089,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 	attr.params = (struct ibv_exp_query_intf_params){
 		.intf_scope = IBV_EXP_INTF_GLOBAL,
 		.intf = IBV_EXP_INTF_WQ,
-		.obj = tmpl.rxq.wq,
+		.obj = tmpl.wq,
 	};
 	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
 	if (tmpl.if_wq == NULL) {
@@ -1110,38 +1102,34 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
 		.wq_state = IBV_EXP_WQS_RDY,
 	};
-	ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
+	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
 	if (ret) {
 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
-	/* Post SGEs. */
-	elts = tmpl.rxq.elts;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
-		ret = tmpl.if_wq->recv_burst(
-			tmpl.rxq.wq,
-			&(*elts)[i].sge,
-			1);
-		if (ret)
-			break;
+	ret = rxq_setup(&tmpl, rxq_ctrl);
+	if (ret) {
+		ERROR("%p: cannot initialize RX queue structure: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
 	}
+	ret = rxq_alloc_elts(&tmpl, desc, NULL);
 	if (ret) {
-		ERROR("%p: failed to post SGEs with error %d",
-		      (void *)dev, ret);
-		/* Set ret because it does not contain a valid errno value. */
-		ret = EIO;
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(ret));
 		goto error;
 	}
 	/* Clean up rxq in case we're reinitializing it. */
 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
 	rxq_cleanup(rxq_ctrl);
 	*rxq_ctrl = tmpl;
+	/* Update doorbell counter. */
+	rxq_ctrl->rxq.rq_ci = desc;
+	rte_wmb();
+	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
 	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
 	assert(ret == 0);
-	/* Assign function in queue. */
-	rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
-	rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
 	return 0;
 error:
 	rxq_cleanup(&tmpl);
@@ -1175,14 +1163,19 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 {
 	struct priv *priv = dev->data->dev_private;
 	struct rxq *rxq = (*priv->rxqs)[idx];
-	struct rxq_ctrl *rxq_ctrl;
+	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
 	int ret;
 
 	if (mlx5_is_secondary())
 		return -E_RTE_SECONDARY;
 
 	priv_lock(priv);
-	rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+	if (!rte_is_power_of_2(desc)) {
+		desc = 1 << log2above(desc);
+		WARN("%p: increased number of descriptors in RX queue %u"
+		     " to the next power of two (%d)",
+		     (void *)dev, idx, desc);
+	}
 	DEBUG("%p: configuring queue %u for %u descriptors",
 	      (void *)dev, idx, desc);
 	if (idx >= priv->rxqs_n) {
@@ -1201,8 +1194,9 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		(*priv->rxqs)[idx] = NULL;
 		rxq_cleanup(rxq_ctrl);
 	} else {
-		rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
-					     socket);
+		rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
+					     desc * sizeof(struct rte_mbuf *),
+					     0, socket);
 		if (rxq_ctrl == NULL) {
 			ERROR("%p: unable to allocate queue index %u",
 			      (void *)dev, idx);
@@ -1210,7 +1204,7 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			return -ENOMEM;
 		}
 	}
-	ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
+	ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
 	if (ret)
 		rte_free(rxq_ctrl);
 	else {
@@ -1245,12 +1239,12 @@ mlx5_rx_queue_release(void *dpdk_rxq)
 	if (rxq == NULL)
 		return;
 	rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
-	priv = rxq->priv;
+	priv = rxq_ctrl->priv;
 	priv_lock(priv);
 	for (i = 0; (i != priv->rxqs_n); ++i)
 		if ((*priv->rxqs)[i] == rxq) {
 			DEBUG("%p: removing RX queue %p from list",
-			      (void *)priv->dev, (void *)rxq);
+			      (void *)priv->dev, (void *)rxq_ctrl);
 			(*priv->rxqs)[i] = NULL;
 			break;
 		}
@@ -1280,7 +1274,8 @@ mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
 			      uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	struct priv *priv = mlx5_secondary_data_setup(rxq->priv);
+	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
+	struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
 	struct priv *primary_priv;
 	unsigned int index;
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 6a0d707..f2d00bf 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -42,6 +42,8 @@
 #pragma GCC diagnostic ignored "-pedantic"
 #endif
 #include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
+#include <infiniband/arch.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -55,7 +57,7 @@
 #include <rte_prefetch.h>
 #include <rte_common.h>
 #include <rte_branch_prediction.h>
-#include <rte_memory.h>
+#include <rte_ether.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -65,6 +67,47 @@
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+static inline volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+	  unsigned int cqes_n, uint16_t *ci)
+	  __attribute__((always_inline));
+
+static inline int
+rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+
+static volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+	  unsigned int cqes_n, uint16_t *ci)
+{
+	volatile struct mlx5_cqe64 *cqe;
+	uint16_t idx = *ci;
+	uint8_t op_own;
+
+	cqe = &cqes[idx & (cqes_n - 1)].cqe64;
+	op_own = cqe->op_own;
+	if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
+		return NULL;
+	} else if (unlikely(op_own & 0x80)) {
+		switch (op_own >> 4) {
+		case MLX5_CQE_INVALID:
+			return NULL; /* No CQE */
+		case MLX5_CQE_REQ_ERR:
+			return cqe;
+		case MLX5_CQE_RESP_ERR:
+			++(*ci);
+			return NULL;
+		default:
+			return NULL;
+		}
+	}
+	if (cqe) {
+		*ci = idx + 1;
+		return cqe;
+	}
+	return NULL;
+}
 
 /**
  * Manage TX completions.
@@ -390,8 +433,8 @@ stop:
 /**
  * Translate RX completion flags to packet type.
  *
- * @param flags
- *   RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ *   Pointer to CQE.
  *
  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
  *
@@ -399,11 +442,13 @@ stop:
  *   Packet type for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
 {
 	uint32_t pkt_type;
+	uint8_t flags = cqe->l4_hdr_type_etc;
+	uint8_t info = cqe->rsvd0[0];
 
-	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
+	if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
 		pkt_type =
 			TRANSPOSE(flags,
 				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
@@ -420,11 +465,11 @@ rxq_cq_to_pkt_type(uint32_t flags)
 	else
 		pkt_type =
 			TRANSPOSE(flags,
-				  IBV_EXP_CQ_RX_IPV4_PACKET,
-				  RTE_PTYPE_L3_IPV4) |
+				  MLX5_CQE_L3_HDR_TYPE_IPV6,
+				  RTE_PTYPE_L3_IPV6) |
 			TRANSPOSE(flags,
-				  IBV_EXP_CQ_RX_IPV6_PACKET,
-				  RTE_PTYPE_L3_IPV6);
+				  MLX5_CQE_L3_HDR_TYPE_IPV4,
+				  RTE_PTYPE_L3_IPV4);
 	return pkt_type;
 }
 
@@ -433,50 +478,69 @@ rxq_cq_to_pkt_type(uint32_t flags)
  *
  * @param[in] rxq
  *   Pointer to RX queue structure.
- * @param flags
- *   RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ *   Pointer to CQE.
  *
  * @return
  *   Offload flags (ol_flags) for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
 {
 	uint32_t ol_flags = 0;
+	uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
+	uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
+	uint8_t info = cqe->rsvd0[0];
 
-	if (rxq->csum) {
-		/* Set IP checksum flag only for IPv4/IPv6 packets. */
-		if (flags &
-		    (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
-			ol_flags |=
-				TRANSPOSE(~flags,
-					IBV_EXP_CQ_RX_IP_CSUM_OK,
-					PKT_RX_IP_CKSUM_BAD);
-		/* Set L4 checksum flag only for TCP/UDP packets. */
-		if (flags &
-		    (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
-			ol_flags |=
-				TRANSPOSE(~flags,
-					IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
-					PKT_RX_L4_CKSUM_BAD);
-	}
+	if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
+	    (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
+		ol_flags |=
+			(!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
+			 PKT_RX_IP_CKSUM_BAD);
+	if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
+	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
+	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
+	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
+		ol_flags |=
+			(!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
+			 PKT_RX_L4_CKSUM_BAD);
 	/*
 	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
 	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
 	 * (its value is 0).
 	 */
-	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
+	if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
 		ol_flags |=
-			TRANSPOSE(~flags,
+			TRANSPOSE(~cqe->l4_hdr_type_etc,
 				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
 				  PKT_RX_IP_CKSUM_BAD) |
-			TRANSPOSE(~flags,
+			TRANSPOSE(~cqe->l4_hdr_type_etc,
 				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
 				  PKT_RX_L4_CKSUM_BAD);
 	return ol_flags;
 }
 
 /**
+ * Get size of the next packet.
+ *
+ * @param rxq
+ *   RX queue to fetch packet from.
+ *
+ * @return
+ *   Packet size in bytes.
+ */
+static inline int __attribute__((always_inline))
+rx_poll_len(struct rxq *rxq)
+{
+	volatile struct mlx5_cqe64 *cqe;
+
+	cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
+	if (cqe)
+		return ntohl(cqe->byte_cnt);
+	return 0;
+}
+
+/**
  * DPDK callback for RX.
  *
  * @param dpdk_rxq
@@ -492,133 +556,84 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
 uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_sge sges[pkts_n];
-	unsigned int i;
+	struct rxq *rxq = dpdk_rxq;
 	unsigned int pkts_ret = 0;
-	int ret;
+	unsigned int i;
+	unsigned int rq_ci = rxq->rq_ci;
+	const unsigned int elts_n = rxq->elts_n;
+	const unsigned int wqe_cnt = elts_n - 1;
 
 	for (i = 0; (i != pkts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		unsigned int len;
-		struct rte_mbuf *seg = elt->buf;
+		unsigned int idx = rq_ci & wqe_cnt;
 		struct rte_mbuf *rep;
-		uint32_t flags;
-		uint16_t vlan_tci;
+		struct rte_mbuf *pkt;
+		unsigned int len;
+		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
+		volatile struct mlx5_cqe64 *cqe =
+			&(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
 
-		/* Sanity checks. */
-		assert(seg != NULL);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
-		if (unlikely(ret < 0)) {
-			struct ibv_wc wc;
-			int wcs_n;
-
-			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
-			      (void *)rxq, ret);
-			/* ibv_poll_cq() must be used in case of failure. */
-			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
-			if (unlikely(wcs_n == 0))
-				break;
-			if (unlikely(wcs_n < 0)) {
-				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
-				      (void *)rxq, wcs_n);
-				break;
-			}
-			assert(wcs_n == 1);
-			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
-				/* Whatever, just repost the offending WR. */
-				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
-				      " completion status (%d): %s",
-				      (void *)rxq, wc.wr_id, wc.status,
-				      ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-				/* Increment dropped packets counter. */
-				++rxq->stats.idropped;
-#endif
-				/* Add SGE to array for repost. */
-				sges[i] = elt->sge;
-				goto repost;
-			}
-			ret = wc.byte_len;
-		}
-		if (ret == 0)
-			break;
-		assert(ret >= (rxq->crc_present << 2));
-		len = ret - (rxq->crc_present << 2);
+		pkt = (*rxq->elts)[idx];
+		rte_prefetch0(cqe);
 		rep = rte_mbuf_raw_alloc(rxq->mp);
 		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p: can't allocate a new mbuf",
-			      (void *)rxq);
-			/* Increment out of memory counters. */
 			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
+			break;
 		}
-
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		elt->buf = rep;
-
-		/* Add SGE to array for repost. */
-		sges[i] = elt->sge;
-
-		/* Update seg information. */
-		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
-		NB_SEGS(seg) = 1;
-		PORT(seg) = rxq->port_id;
-		NEXT(seg) = NULL;
-		PKT_LEN(seg) = len;
-		DATA_LEN(seg) = len;
-		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
-			seg->packet_type = rxq_cq_to_pkt_type(flags);
-			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-				seg->ol_flags |= PKT_RX_VLAN_PKT;
-				seg->vlan_tci = vlan_tci;
+		SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
+		NB_SEGS(rep) = 1;
+		PORT(rep) = rxq->port_id;
+		NEXT(rep) = NULL;
+		len = rx_poll_len(rxq);
+		if (unlikely(len == 0)) {
+			rte_mbuf_refcnt_set(rep, 0);
+			__rte_mbuf_raw_free(rep);
+			break;
+		}
+		/*
+		 * Fill NIC descriptor with the new buffer.  The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		wqe->addr = htonll((uintptr_t)rep->buf_addr +
+				   RTE_PKTMBUF_HEADROOM);
+		(*rxq->elts)[idx] = rep;
+		/* Update pkt information. */
+		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+		    rxq->crc_present) {
+			if (rxq->csum) {
+				pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+				pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
+			}
+			if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
+				pkt->ol_flags |= PKT_RX_VLAN_PKT;
+				pkt->vlan_tci = ntohs(cqe->vlan_info);
 			}
+			if (rxq->crc_present)
+				len -= ETHER_CRC_LEN;
 		}
-		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
+		PKT_LEN(pkt) = len;
+		DATA_LEN(pkt) = len;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment bytes counter. */
 		rxq->stats.ibytes += len;
 #endif
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
+		/* Return packet. */
+		*(pkts++) = pkt;
+		++pkts_ret;
+		++rq_ci;
 	}
-	if (unlikely(i == 0))
+	if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
 		return 0;
 	/* Repost WRs. */
 #ifdef DEBUG_RECV
 	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
 #endif
-	ret = rxq->recv(rxq->wq, sges, i);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
+	/* Update the consumer index. */
+	rxq->rq_ci = rq_ci;
+	rte_wmb();
+	*rxq->cq_db = htonl(rxq->cq_ci);
+	rte_wmb();
+	*rxq->rq_db = htonl(rxq->rq_ci);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Increment packets counter. */
 	rxq->stats.ipackets += pkts_ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 570345b..1827123 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -43,6 +43,7 @@
 #pragma GCC diagnostic ignored "-pedantic"
 #endif
 #include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -61,6 +62,7 @@
 #include "mlx5.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
+#include "mlx5_prm.h"
 
 struct mlx5_rxq_stats {
 	unsigned int idx; /**< Mapping index. */
@@ -81,12 +83,6 @@ struct mlx5_txq_stats {
 	uint64_t odropped; /**< Total of packets not sent when TX ring full. */
 };
 
-/* RX element. */
-struct rxq_elt {
-	struct ibv_sge sge; /* Scatter/Gather Element. */
-	struct rte_mbuf *buf; /* SGE buffer. */
-};
-
 /* Flow director queue structure. */
 struct fdir_queue {
 	struct ibv_qp *qp; /* Associated RX QP. */
@@ -97,25 +93,28 @@ struct priv;
 
 /* RX queue descriptor. */
 struct rxq {
-	struct priv *priv; /* Back pointer to private data. */
-	struct rte_mempool *mp; /* Memory Pool for allocations. */
-	struct ibv_cq *cq; /* Completion Queue. */
-	struct ibv_exp_wq *wq; /* Work Queue. */
-	int32_t (*poll)(); /* Verbs poll function. */
-	int32_t (*recv)(); /* Verbs receive function. */
-	unsigned int port_id; /* Port ID for incoming packets. */
-	unsigned int elts_n; /* (*elts)[] length. */
-	unsigned int elts_head; /* Current index in (*elts)[]. */
 	unsigned int csum:1; /* Enable checksum offloading. */
 	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
-	struct rxq_elt (*elts)[]; /* RX elements. */
-	struct mlx5_rxq_stats stats; /* RX queue counters. */
+	uint16_t rq_ci;
+	uint16_t cq_ci;
+	uint16_t elts_n;
+	uint16_t port_id;
+	volatile struct mlx5_wqe_data_seg(*wqes)[];
+	volatile struct mlx5_cqe(*cqes)[];
+	volatile uint32_t *rq_db;
+	volatile uint32_t *cq_db;
+	struct rte_mbuf *(*elts)[];
+	struct rte_mempool *mp;
+	struct mlx5_rxq_stats stats;
 } __rte_cache_aligned;
 
 /* RX queue control descriptor. */
 struct rxq_ctrl {
+	struct priv *priv; /* Back pointer to private data. */
+	struct ibv_cq *cq; /* Completion Queue. */
+	struct ibv_exp_wq *wq; /* Work Queue. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	struct fdir_queue fdir_queue; /* Flow director queue. */
 	struct ibv_mr *mr; /* Memory Region (for mp). */
@@ -284,8 +283,9 @@ int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
 int priv_rehash_flows(struct priv *);
 void rxq_cleanup(struct rxq_ctrl *);
 int rxq_rehash(struct rte_eth_dev *, struct rxq_ctrl *);
-int rxq_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t, unsigned int,
-	      const struct rte_eth_rxconf *, struct rte_mempool *);
+int rxq_ctrl_setup(struct rte_eth_dev *, struct rxq_ctrl *, uint16_t,
+		   unsigned int, const struct rte_eth_rxconf *,
+		   struct rte_mempool *);
 int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
 			const struct rte_eth_rxconf *, struct rte_mempool *);
 void mlx5_rx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 3b9b771..4719e69 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -144,6 +144,7 @@ static void
 priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
 {
 	struct rxq *rxq = (*priv->rxqs)[idx];
+	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
 	struct ibv_exp_wq_attr mod;
 	uint16_t vlan_offloads =
 		(on ? IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : 0) |
@@ -157,7 +158,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
 		.vlan_offloads = vlan_offloads,
 	};
 
-	err = ibv_exp_modify_wq(rxq->wq, &mod);
+	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
 	if (err) {
 		ERROR("%p: failed to modified stripping mode: %s",
 		      (void *)priv, strerror(err));
-- 
2.1.4

WARNING: multiple messages have this Message-ID
From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Yaacov Hazan <yaacovh@mellanox.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 14/25] mlx5: refactor Tx data path
Date: Thu, 23 Jun 2016 19:05:12 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170512.m202zBHi8ZBzcGvaeyu9PXyu7QNiw3bojI6m0ZJ5AwA@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

Bypass Verbs to improve Tx performance.

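In short, each packet is turned into a 64-byte WQE (control, Ethernet and
data segments) written directly by the PMD, and the doorbell record plus
BlueFlame register are hit once per burst. A condensed sketch of the send
loop built from the helpers in this diff; the txq fields not visible in the
truncated hunks ("wqes", "wqe_n") and the txq_mb2mp() helper are assumptions
here:

/* Sketch only: completion handling, ring-full checks and statistics
 * are omitted; see mlx5_tx_burst() in the diff for the real code. */
static uint16_t
tx_burst_sketch(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
        uint16_t i;

        for (i = 0; i != pkts_n; ++i) {
                struct rte_mbuf *buf = pkts[i];
                uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
                uint32_t length = DATA_LEN(buf);
                uint32_t lkey = txq_mp2mr(txq, txq_mb2mp(buf));
                volatile union mlx5_wqe *wqe =
                        &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];

                if (buf->ol_flags & PKT_TX_VLAN_PKT)
                        mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
                                            buf->vlan_tci);
                else
                        mlx5_wqe_write(txq, wqe, addr, length, lkey);
                (*txq->elts)[txq->elts_head] = buf;
                txq->elts_head = (txq->elts_head + 1) & (txq->elts_n - 1);
        }
        if (i)
                mlx5_tx_dbrec(txq); /* One doorbell for the whole burst. */
        return i;
}
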
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Yaacov Hazan <yaacovh@mellanox.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/Makefile      |   5 -
 drivers/net/mlx5/mlx5_ethdev.c |  10 +-
 drivers/net/mlx5/mlx5_mr.c     |   4 +-
 drivers/net/mlx5/mlx5_rxtx.c   | 361 ++++++++++++++++++++++-------------------
 drivers/net/mlx5/mlx5_rxtx.h   |  52 +++---
 drivers/net/mlx5/mlx5_txq.c    | 219 +++++++++++++------------
 6 files changed, 347 insertions(+), 304 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index dc99797..66687e8 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -106,11 +106,6 @@ mlx5_autoconf.h.new: FORCE
 mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q sh -- '$<' '$@' \
-		HAVE_VERBS_VLAN_INSERTION \
-		infiniband/verbs.h \
-		enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \
-		$(AUTOCONF_OUTPUT)
-	$Q sh -- '$<' '$@' \
 		HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \
 		infiniband/verbs_exp.h \
 		enum IBV_EXP_CQ_COMPRESSED_CQE \
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 16b05d3..47e64b2 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1242,11 +1242,11 @@ mlx5_secondary_data_setup(struct priv *priv)
 		txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0,
 					     primary_txq_ctrl->socket);
 		if (txq_ctrl != NULL) {
-			if (txq_setup(priv->dev,
-				      primary_txq_ctrl,
-				      primary_txq->elts_n,
-				      primary_txq_ctrl->socket,
-				      NULL) == 0) {
+			if (txq_ctrl_setup(priv->dev,
+					   primary_txq_ctrl,
+					   primary_txq->elts_n,
+					   primary_txq_ctrl->socket,
+					   NULL) == 0) {
 				txq_ctrl->txq.stats.idx =
 					primary_txq->stats.idx;
 				tx_queues[i] = &txq_ctrl->txq;
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 1d8bf72..67dfefa 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -190,7 +190,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq_ctrl, mp->name, (void *)mp);
-	mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp);
+	mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq_ctrl);
@@ -209,7 +209,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx)
 	/* Store the new entry. */
 	txq_ctrl->txq.mp2mr[idx].mp = mp;
 	txq_ctrl->txq.mp2mr[idx].mr = mr;
-	txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey;
+	txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey);
 	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
 	      (void *)txq_ctrl, mp->name, (void *)mp,
 	      txq_ctrl->txq.mp2mr[idx].lkey);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index f2d00bf..2372fce 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -119,68 +119,52 @@ get_cqe64(volatile struct mlx5_cqe cqes[],
  *
  * @param txq
  *   Pointer to TX queue structure.
- *
- * @return
- *   0 on success, -1 on failure.
  */
-static int
+static void
 txq_complete(struct txq *txq)
 {
-	unsigned int elts_comp = txq->elts_comp;
-	unsigned int elts_tail = txq->elts_tail;
-	unsigned int elts_free = txq->elts_tail;
 	const unsigned int elts_n = txq->elts_n;
-	int wcs_n;
-
-	if (unlikely(elts_comp == 0))
-		return 0;
-#ifdef DEBUG_SEND
-	DEBUG("%p: processing %u work requests completions",
-	      (void *)txq, elts_comp);
-#endif
-	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
-	if (unlikely(wcs_n == 0))
-		return 0;
-	if (unlikely(wcs_n < 0)) {
-		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
-		      (void *)txq, wcs_n);
-		return -1;
+	const unsigned int cqe_n = txq->cqe_n;
+	uint16_t elts_free = txq->elts_tail;
+	uint16_t elts_tail;
+	uint16_t cq_ci = txq->cq_ci;
+	unsigned int wqe_ci = (unsigned int)-1;
+	int ret = 0;
+
+	while (ret == 0) {
+		volatile struct mlx5_cqe64 *cqe;
+
+		cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
+		if (cqe == NULL)
+			break;
+		wqe_ci = ntohs(cqe->wqe_counter);
 	}
-	elts_comp -= wcs_n;
-	assert(elts_comp <= txq->elts_comp);
-	/*
-	 * Assume WC status is successful as nothing can be done about it
-	 * anyway.
-	 */
-	elts_tail += wcs_n * txq->elts_comp_cd_init;
-	if (elts_tail >= elts_n)
-		elts_tail -= elts_n;
-
-	while (elts_free != elts_tail) {
-		struct txq_elt *elt = &(*txq->elts)[elts_free];
+	if (unlikely(wqe_ci == (unsigned int)-1))
+		return;
+	/* Free buffers. */
+	elts_tail = (wqe_ci + 1) & (elts_n - 1);
+	do {
+		struct rte_mbuf *elt = (*txq->elts)[elts_free];
 		unsigned int elts_free_next =
-			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
-		struct rte_mbuf *tmp = elt->buf;
-		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];
+			(elts_free + 1) & (elts_n - 1);
+		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];
 
 #ifndef NDEBUG
 		/* Poisoning. */
-		memset(elt, 0x66, sizeof(*elt));
+		memset(&(*txq->elts)[elts_free],
+		       0x66,
+		       sizeof((*txq->elts)[elts_free]));
 #endif
-		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Faster than rte_pktmbuf_free(). */
-		do {
-			struct rte_mbuf *next = NEXT(tmp);
-
-			rte_pktmbuf_free_seg(tmp);
-			tmp = next;
-		} while (tmp != NULL);
+		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
+		/* Only one segment needs to be freed. */
+		rte_pktmbuf_free_seg(elt);
 		elts_free = elts_free_next;
-	}
-
+	} while (elts_free != elts_tail);
+	txq->cq_ci = cq_ci;
 	txq->elts_tail = elts_tail;
-	txq->elts_comp = elts_comp;
-	return 0;
+	/* Update the consumer index. */
+	rte_wmb();
+	*txq->cq_db = htonl(cq_ci);
 }
 
 /**
@@ -231,7 +215,8 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 		}
 		if (txq->mp2mr[i].mp == mp) {
 			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			assert(htonl(txq->mp2mr[i].mr->lkey) ==
+			       txq->mp2mr[i].lkey);
 			lkey = txq->mp2mr[i].lkey;
 			break;
 		}
@@ -242,33 +227,136 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 }
 
 /**
- * Insert VLAN using mbuf headroom space.
- *
- * @param buf
- *   Buffer for VLAN insertion.
+ * Write a regular WQE.
  *
- * @return
- *   0 on success, errno value on failure.
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the WQE to fill.
+ * @param addr
+ *   Buffer data address.
+ * @param length
+ *   Packet length.
+ * @param lkey
+ *   Memory region lkey.
  */
-static inline int
-insert_vlan_sw(struct rte_mbuf *buf)
+static inline void
+mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
+	       uintptr_t addr, uint32_t length, uint32_t lkey)
 {
-	uintptr_t addr;
-	uint32_t vlan;
-	uint16_t head_room_len = rte_pktmbuf_headroom(buf);
+	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+	wqe->wqe.ctrl.data[3] = 0;
+	wqe->inl.eseg.rsvd0 = 0;
+	wqe->inl.eseg.rsvd1 = 0;
+	wqe->inl.eseg.mss = 0;
+	wqe->inl.eseg.rsvd2 = 0;
+	wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+	/* Copy the first 16 bytes into inline header. */
+	rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+		   (uint8_t *)(uintptr_t)addr,
+		   MLX5_ETH_INLINE_HEADER_SIZE);
+	addr += MLX5_ETH_INLINE_HEADER_SIZE;
+	length -= MLX5_ETH_INLINE_HEADER_SIZE;
+	/* Store remaining data in data segment. */
+	wqe->wqe.dseg.byte_count = htonl(length);
+	wqe->wqe.dseg.lkey = lkey;
+	wqe->wqe.dseg.addr = htonll(addr);
+	/* Increment consumer index. */
+	++txq->wqe_ci;
+}
 
-	if (head_room_len < 4)
-		return EINVAL;
+/**
+ * Write a regular WQE with VLAN.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the WQE to fill.
+ * @param addr
+ *   Buffer data address.
+ * @param length
+ *   Packet length.
+ * @param lkey
+ *   Memory region lkey.
+ * @param vlan_tci
+ *   VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+		    uintptr_t addr, uint32_t length, uint32_t lkey,
+		    uint16_t vlan_tci)
+{
+	uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+	wqe->wqe.ctrl.data[3] = 0;
+	wqe->inl.eseg.rsvd0 = 0;
+	wqe->inl.eseg.rsvd1 = 0;
+	wqe->inl.eseg.mss = 0;
+	wqe->inl.eseg.rsvd2 = 0;
+	wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+	/*
+	 * Copy 12 bytes of source & destination MAC address.
+	 * Copy 4 bytes of VLAN.
+	 * Copy 2 bytes of Ether type.
+	 */
+	rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
+		   (uint8_t *)(uintptr_t)addr, 12);
+	rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
+		   &vlan, sizeof(vlan));
+	rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
+		   (uint8_t *)((uintptr_t)addr + 12), 2);
+	addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+	length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+	/* Store remaining data in data segment. */
+	wqe->wqe.dseg.byte_count = htonl(length);
+	wqe->wqe.dseg.lkey = lkey;
+	wqe->wqe.dseg.addr = htonll(addr);
+	/* Increment consumer index. */
+	++txq->wqe_ci;
+}
 
-	addr = rte_pktmbuf_mtod(buf, uintptr_t);
-	vlan = htonl(0x81000000 | buf->vlan_tci);
-	memmove((void *)(addr - 4), (void *)addr, 12);
-	memcpy((void *)(addr + 8), &vlan, sizeof(vlan));
+/**
+ * Ring TX queue doorbell.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ */
+static inline void
+mlx5_tx_dbrec(struct txq *txq)
+{
+	uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
+	uint32_t data[4] = {
+		htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+		htonl(txq->qp_num_8s),
+		0,
+		0,
+	};
+	rte_wmb();
+	*txq->qp_db = htonl(txq->wqe_ci);
+	/* Ensure ordering between DB record and BF copy. */
+	rte_wmb();
+	rte_mov16(dst, (uint8_t *)data);
+	txq->bf_offset ^= txq->bf_buf_size;
+}
 
-	SET_DATA_OFF(buf, head_room_len - 4);
-	DATA_LEN(buf) += 4;
+/**
+ * Prefetch a CQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param ci
+ *   CQE consumer index.
+ */
+static inline void
+tx_prefetch_cqe(struct txq *txq, uint16_t ci)
+{
+	volatile struct mlx5_cqe64 *cqe;
 
-	return 0;
+	cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
+	rte_prefetch0(cqe);
 }
 
 /**
@@ -288,18 +376,21 @@ uint16_t
 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct txq *txq = (struct txq *)dpdk_txq;
-	unsigned int elts_head = txq->elts_head;
+	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp_cd = txq->elts_comp_cd;
-	unsigned int elts_comp = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
-	struct rte_mbuf *buf = pkts[0];
+	volatile union mlx5_wqe *wqe;
+	struct rte_mbuf *buf;
 
-	assert(elts_comp_cd != 0);
+	if (unlikely(!pkts_n))
+		return 0;
+	buf = pkts[0];
 	/* Prefetch first packet cacheline. */
+	tx_prefetch_cqe(txq, txq->cq_ci);
+	tx_prefetch_cqe(txq, txq->cq_ci + 1);
 	rte_prefetch0(buf);
+	/* Start processing. */
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
@@ -313,101 +404,53 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf_next = pkts[i + 1];
-		unsigned int elts_head_next =
-			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt = &(*txq->elts)[elts_head];
-		uint32_t send_flags = 0;
-#ifdef HAVE_VERBS_VLAN_INSERTION
-		int insert_vlan = 0;
-#endif /* HAVE_VERBS_VLAN_INSERTION */
+		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t lkey;
-		uintptr_t buf_next_addr;
 
+		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		rte_prefetch0(wqe);
 		if (i + 1 < max)
-			rte_prefetch0(buf_next);
-		/* Request TX completion. */
-		if (unlikely(--elts_comp_cd == 0)) {
-			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
-			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
-		}
-		/* Should we enable HW CKSUM offload */
-		if (buf->ol_flags &
-		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
-			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
-			/* HW does not support checksum offloads at arbitrary
-			 * offsets but automatically recognizes the packet
-			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
-			 * tunnels are currently supported. */
-			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
-				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
-		}
-		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
-#ifdef HAVE_VERBS_VLAN_INSERTION
-			if (!txq->priv->mps)
-				insert_vlan = 1;
-			else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
-			{
-				err = insert_vlan_sw(buf);
-				if (unlikely(err))
-					goto stop;
-			}
-		}
+			rte_prefetch0(pkts[i + 1]);
 		/* Retrieve buffer information. */
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
 		/* Update element. */
-		elt->buf = buf;
-		if (txq->priv->sriov)
-			rte_prefetch0((volatile void *)
-				      (uintptr_t)addr);
+		(*txq->elts)[elts_head] = buf;
 		/* Prefetch next buffer data. */
-		if (i + 1 < max) {
-			buf_next_addr =
-				rte_pktmbuf_mtod(buf_next, uintptr_t);
-			rte_prefetch0((volatile void *)
-				      (uintptr_t)buf_next_addr);
-		}
+		if (i + 1 < max)
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+						       volatile void *));
 		/* Retrieve Memory Region key for this memory pool. */
 		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-		if (unlikely(lkey == (uint32_t)-1)) {
-			/* MR does not exist. */
-			DEBUG("%p: unable to get MP <-> MR"
-			      " association", (void *)txq);
-			/* Clean up TX element. */
-			elt->buf = NULL;
-			goto stop;
-		}
-#ifdef HAVE_VERBS_VLAN_INSERTION
-		if (insert_vlan)
-			err = txq->send_pending_vlan
-				(txq->qp,
-				 addr,
-				 length,
-				 lkey,
-				 send_flags,
-				 &buf->vlan_tci);
+		if (buf->ol_flags & PKT_TX_VLAN_PKT)
+			mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
+					    buf->vlan_tci);
 		else
-#endif /* HAVE_VERBS_VLAN_INSERTION */
-			err = txq->send_pending
-				(txq->qp,
-				 addr,
-				 length,
-				 lkey,
-				 send_flags);
-		if (unlikely(err))
-			goto stop;
+			mlx5_wqe_write(txq, wqe, addr, length, lkey);
+		/* Request completion if needed. */
+		if (unlikely(--txq->elts_comp == 0)) {
+			wqe->wqe.ctrl.data[2] = htonl(8);
+			txq->elts_comp = txq->elts_comp_cd_init;
+		} else {
+			wqe->wqe.ctrl.data[2] = 0;
+		}
+		/* Should we enable HW CKSUM offload */
+		if (buf->ol_flags &
+		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+			wqe->wqe.eseg.cs_flags =
+				MLX5_ETH_WQE_L3_CSUM |
+				MLX5_ETH_WQE_L4_CSUM;
+		} else {
+			wqe->wqe.eseg.cs_flags = 0;
+		}
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += length;
 #endif
-stop:
 		elts_head = elts_head_next;
-		buf = buf_next;
+		buf = pkts[i + 1];
 	}
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
@@ -417,16 +460,8 @@ stop:
 	txq->stats.opackets += i;
 #endif
 	/* Ring QP doorbell. */
-	err = txq->send_flush(txq->qp);
-	if (unlikely(err)) {
-		/* A nonzero value is not supposed to be returned.
-		 * Nothing can be done about it. */
-		DEBUG("%p: send_flush() failed with error %d",
-		      (void *)txq, err);
-	}
+	mlx5_tx_dbrec(txq);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
-	txq->elts_comp_cd = elts_comp_cd;
 	return i;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1827123..6b3bb2d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -223,44 +223,40 @@ struct hash_rxq {
 		[MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS];
 };
 
-/* TX element. */
-struct txq_elt {
-	struct rte_mbuf *buf;
-};
-
 /* TX queue descriptor. */
 struct txq {
-	struct priv *priv; /* Back pointer to private data. */
-	int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max);
-	int (*send_pending)();
-#ifdef HAVE_VERBS_VLAN_INSERTION
-	int (*send_pending_vlan)();
-#endif
-	int (*send_flush)(struct ibv_qp *qp);
-	struct ibv_cq *cq; /* Completion Queue. */
-	struct ibv_qp *qp; /* Queue Pair. */
-	struct txq_elt (*elts)[]; /* TX elements. */
-	unsigned int elts_n; /* (*elts)[] length. */
-	unsigned int elts_head; /* Current index in (*elts)[]. */
-	unsigned int elts_tail; /* First element awaiting completion. */
-	unsigned int elts_comp; /* Number of completion requests. */
-	unsigned int elts_comp_cd; /* Countdown for next completion request. */
-	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+	uint16_t elts_head; /* Current index in (*elts)[]. */
+	uint16_t elts_tail; /* First element awaiting completion. */
+	uint16_t elts_comp_cd_init; /* Initial value for countdown. */
+	uint16_t elts_comp; /* Elements before asking a completion. */
+	uint16_t elts_n; /* (*elts)[] length. */
+	uint16_t cq_ci; /* Consumer index for completion queue. */
+	uint16_t cqe_n; /* Number of CQ elements. */
+	uint16_t wqe_ci; /* Consumer index for work queue. */
+	uint16_t wqe_n; /* Number of WQ elements. */
+	uint16_t bf_offset; /* Blueflame offset. */
+	uint16_t bf_buf_size; /* Blueflame size. */
+	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
+	volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
+	volatile uint32_t *qp_db; /* Work queue doorbell. */
+	volatile uint32_t *cq_db; /* Completion queue doorbell. */
+	volatile void *bf_reg; /* Blueflame register. */
 	struct {
 		const struct rte_mempool *mp; /* Cached Memory Pool. */
 		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* mr->lkey */
+		uint32_t lkey; /* htonl(mr->lkey) */
 	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
+	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+	uint32_t qp_num_8s; /* QP number shifted by 8. */
 } __rte_cache_aligned;
 
 /* TX queue control descriptor. */
 struct txq_ctrl {
-#ifdef HAVE_VERBS_VLAN_INSERTION
-	struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */
-#else
+	struct priv *priv; /* Back pointer to private data. */
+	struct ibv_cq *cq; /* Completion Queue. */
+	struct ibv_qp *qp; /* Queue Pair. */
 	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
-#endif
 	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
 	unsigned int socket; /* CPU socket ID for allocations. */
@@ -294,8 +290,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 /* mlx5_txq.c */
 
 void txq_cleanup(struct txq_ctrl *);
-int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int,
-	      const struct rte_eth_txconf *);
+int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t,
+		   unsigned int, const struct rte_eth_txconf *);
 int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
 			const struct rte_eth_txconf *);
 void mlx5_tx_queue_release(void *);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index ec4488a..26d6168 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -60,6 +60,7 @@
 #endif
 
 #include "mlx5_utils.h"
+#include "mlx5_defs.h"
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
@@ -72,48 +73,22 @@
  *   Pointer to TX queue structure.
  * @param elts_n
  *   Number of elements to allocate.
- *
- * @return
- *   0 on success, errno value on failure.
  */
-static int
+static void
 txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 {
 	unsigned int i;
-	struct txq_elt (*elts)[elts_n] =
-		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket);
-	int ret = 0;
 
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)txq_ctrl);
-		ret = ENOMEM;
-		goto error;
-	}
-	for (i = 0; (i != elts_n); ++i) {
-		struct txq_elt *elt = &(*elts)[i];
+	for (i = 0; (i != elts_n); ++i)
+		(*txq_ctrl->txq.elts)[i] = NULL;
+	for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) {
+		volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i];
 
-		elt->buf = NULL;
+		memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
 	}
 	DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
-	txq_ctrl->txq.elts_n = elts_n;
-	txq_ctrl->txq.elts = elts;
 	txq_ctrl->txq.elts_head = 0;
 	txq_ctrl->txq.elts_tail = 0;
-	txq_ctrl->txq.elts_comp = 0;
-	/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
-	 * at least 4 times per ring. */
-	txq_ctrl->txq.elts_comp_cd_init =
-		((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
-		 MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
-	txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init;
-	assert(ret == 0);
-	return 0;
-error:
-	rte_free(elts);
-
-	DEBUG("%p: failed, freed everything", (void *)txq_ctrl);
-	assert(ret > 0);
-	return ret;
 }
 
 /**
@@ -128,32 +103,26 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
 	unsigned int elts_n = txq_ctrl->txq.elts_n;
 	unsigned int elts_head = txq_ctrl->txq.elts_head;
 	unsigned int elts_tail = txq_ctrl->txq.elts_tail;
-	struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts;
+	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;
 
 	DEBUG("%p: freeing WRs", (void *)txq_ctrl);
-	txq_ctrl->txq.elts_n = 0;
 	txq_ctrl->txq.elts_head = 0;
 	txq_ctrl->txq.elts_tail = 0;
-	txq_ctrl->txq.elts_comp = 0;
-	txq_ctrl->txq.elts_comp_cd = 0;
-	txq_ctrl->txq.elts_comp_cd_init = 0;
-	txq_ctrl->txq.elts = NULL;
 
-	if (elts == NULL)
-		return;
 	while (elts_tail != elts_head) {
-		struct txq_elt *elt = &(*elts)[elts_tail];
+		struct rte_mbuf *elt = (*elts)[elts_tail];
 
-		assert(elt->buf != NULL);
-		rte_pktmbuf_free(elt->buf);
+		assert(elt != NULL);
+		rte_pktmbuf_free(elt);
 #ifndef NDEBUG
 		/* Poisoning. */
-		memset(elt, 0x77, sizeof(*elt));
+		memset(&(*elts)[elts_tail],
+		       0x77,
+		       sizeof((*elts)[elts_tail]));
 #endif
 		if (++elts_tail == elts_n)
 			elts_tail = 0;
 	}
-	rte_free(elts);
 }
 
 /**
@@ -172,42 +141,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
 
 	DEBUG("cleaning up %p", (void *)txq_ctrl);
 	txq_free_elts(txq_ctrl);
-	txq_ctrl->txq.poll_cnt = NULL;
-	txq_ctrl->txq.send_flush = NULL;
 	if (txq_ctrl->if_qp != NULL) {
-		assert(txq_ctrl->txq.priv != NULL);
-		assert(txq_ctrl->txq.priv->ctx != NULL);
-		assert(txq_ctrl->txq.qp != NULL);
+		assert(txq_ctrl->priv != NULL);
+		assert(txq_ctrl->priv->ctx != NULL);
+		assert(txq_ctrl->qp != NULL);
 		params = (struct ibv_exp_release_intf_params){
 			.comp_mask = 0,
 		};
-		claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+		claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
 						txq_ctrl->if_qp,
 						&params));
 	}
 	if (txq_ctrl->if_cq != NULL) {
-		assert(txq_ctrl->txq.priv != NULL);
-		assert(txq_ctrl->txq.priv->ctx != NULL);
-		assert(txq_ctrl->txq.cq != NULL);
+		assert(txq_ctrl->priv != NULL);
+		assert(txq_ctrl->priv->ctx != NULL);
+		assert(txq_ctrl->cq != NULL);
 		params = (struct ibv_exp_release_intf_params){
 			.comp_mask = 0,
 		};
-		claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx,
+		claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx,
 						txq_ctrl->if_cq,
 						&params));
 	}
-	if (txq_ctrl->txq.qp != NULL)
-		claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp));
-	if (txq_ctrl->txq.cq != NULL)
-		claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq));
+	if (txq_ctrl->qp != NULL)
+		claim_zero(ibv_destroy_qp(txq_ctrl->qp));
+	if (txq_ctrl->cq != NULL)
+		claim_zero(ibv_destroy_cq(txq_ctrl->cq));
 	if (txq_ctrl->rd != NULL) {
 		struct ibv_exp_destroy_res_domain_attr attr = {
 			.comp_mask = 0,
 		};
 
-		assert(txq_ctrl->txq.priv != NULL);
-		assert(txq_ctrl->txq.priv->ctx != NULL);
-		claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx,
+		assert(txq_ctrl->priv != NULL);
+		assert(txq_ctrl->priv->ctx != NULL);
+		claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx,
 						      txq_ctrl->rd,
 						      &attr));
 	}
@@ -221,6 +188,49 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
 }
 
 /**
+ * Initialize TX queue.
+ *
+ * @param tmpl
+ *   Pointer to TX queue control template.
+ * @param txq_ctrl
+ *   Pointer to TX queue control.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static inline int
+txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
+{
+	struct mlx5_qp *qp = to_mqp(tmpl->qp);
+	struct ibv_cq *ibcq = tmpl->cq;
+	struct mlx5_cq *cq = to_mxxx(cq, cq);
+
+	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
+		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
+		return EINVAL;
+	}
+	tmpl->txq.cqe_n = ibcq->cqe + 1;
+	tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
+	tmpl->txq.wqes =
+		(volatile union mlx5_wqe (*)[])
+		(uintptr_t)qp->gen_data.sqstart;
+	tmpl->txq.wqe_n = qp->sq.wqe_cnt;
+	tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
+	tmpl->txq.bf_reg = qp->gen_data.bf->reg;
+	tmpl->txq.bf_offset = qp->gen_data.bf->offset;
+	tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size;
+	tmpl->txq.cq_db = cq->dbrec;
+	tmpl->txq.cqes =
+		(volatile struct mlx5_cqe (*)[])
+		(uintptr_t)cq->active_buf->buf;
+	tmpl->txq.elts =
+		(struct rte_mbuf *(*)[tmpl->txq.elts_n])
+		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
+	return 0;
+}
+
+/**
  * Configure a TX queue.
  *
  * @param dev
@@ -238,15 +248,14 @@ txq_cleanup(struct txq_ctrl *txq_ctrl)
  *   0 on success, errno value on failure.
  */
 int
-txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
-	  unsigned int socket, const struct rte_eth_txconf *conf)
+txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
+	       uint16_t desc, unsigned int socket,
+	       const struct rte_eth_txconf *conf)
 {
 	struct priv *priv = mlx5_get_priv(dev);
 	struct txq_ctrl tmpl = {
+		.priv = priv,
 		.socket = socket,
-		.txq = {
-			.priv = priv,
-		},
 	};
 	union {
 		struct ibv_exp_query_intf_params params;
@@ -254,15 +263,21 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 		struct ibv_exp_res_domain_init_attr rd;
 		struct ibv_exp_cq_init_attr cq;
 		struct ibv_exp_qp_attr mod;
+		struct ibv_exp_cq_attr cq_attr;
 	} attr;
 	enum ibv_exp_query_intf_status status;
 	int ret = 0;
 
 	(void)conf; /* Thresholds configuration (ignored). */
-	if (desc == 0) {
-		ERROR("%p: invalid number of TX descriptors", (void *)dev);
-		return EINVAL;
-	}
+	tmpl.txq.elts_n = desc;
+	/*
+	 * Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
+	 * at least 4 times per ring.
+	 */
+	tmpl.txq.elts_comp_cd_init =
+		((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
+		 MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
+	tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -281,9 +296,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
 		.res_domain = tmpl.rd,
 	};
-	tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0,
-					&attr.cq);
-	if (tmpl.txq.cq == NULL) {
+	tmpl.cq = ibv_exp_create_cq(priv->ctx,
+				    (desc / tmpl.txq.elts_comp_cd_init) - 1,
+				    NULL, NULL, 0, &attr.cq);
+	if (tmpl.cq == NULL) {
 		ret = ENOMEM;
 		ERROR("%p: CQ creation failure: %s",
 		      (void *)dev, strerror(ret));
@@ -295,9 +311,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 	      priv->device_attr.max_sge);
 	attr.init = (struct ibv_exp_qp_init_attr){
 		/* CQ to be associated with the send queue. */
-		.send_cq = tmpl.txq.cq,
+		.send_cq = tmpl.cq,
 		/* CQ to be associated with the receive queue. */
-		.recv_cq = tmpl.txq.cq,
+		.recv_cq = tmpl.cq,
 		.cap = {
 			/* Max number of outstanding WRs. */
 			.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -315,8 +331,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
 			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
 	};
-	tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
-	if (tmpl.txq.qp == NULL) {
+	tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+	if (tmpl.qp == NULL) {
 		ret = (errno ? errno : EINVAL);
 		ERROR("%p: QP creation failure: %s",
 		      (void *)dev, strerror(ret));
@@ -328,30 +344,31 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 		/* Primary port number. */
 		.port_num = priv->port
 	};
-	ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod,
+	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
 				(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
 	if (ret) {
 		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
-	ret = txq_alloc_elts(&tmpl, desc);
+	ret = txq_setup(&tmpl, txq_ctrl);
 	if (ret) {
-		ERROR("%p: TXQ allocation failed: %s",
+		ERROR("%p: cannot initialize TX queue structure: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
+	txq_alloc_elts(&tmpl, desc);
 	attr.mod = (struct ibv_exp_qp_attr){
 		.qp_state = IBV_QPS_RTR
 	};
-	ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
 	if (ret) {
 		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
 	attr.mod.qp_state = IBV_QPS_RTS;
-	ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE);
+	ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
 	if (ret) {
 		ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
 		      (void *)dev, strerror(ret));
@@ -360,7 +377,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 	attr.params = (struct ibv_exp_query_intf_params){
 		.intf_scope = IBV_EXP_INTF_GLOBAL,
 		.intf = IBV_EXP_INTF_CQ,
-		.obj = tmpl.txq.cq,
+		.obj = tmpl.cq,
 	};
 	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
 	if (tmpl.if_cq == NULL) {
@@ -372,10 +389,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 	attr.params = (struct ibv_exp_query_intf_params){
 		.intf_scope = IBV_EXP_INTF_GLOBAL,
 		.intf = IBV_EXP_INTF_QP_BURST,
-		.obj = tmpl.txq.qp,
-#ifdef HAVE_VERBS_VLAN_INSERTION
 		.intf_version = 1,
-#endif
+		.obj = tmpl.qp,
 		/* Enable multi-packet send if supported. */
 		.family_flags =
 			(priv->mps ?
@@ -393,12 +408,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc,
 	DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
 	txq_cleanup(txq_ctrl);
 	*txq_ctrl = tmpl;
-	txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt;
-	txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending;
-#ifdef HAVE_VERBS_VLAN_INSERTION
-	txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan;
-#endif
-	txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush;
 	DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
 	/* Pre-register known mempools. */
 	rte_mempool_walk(txq_mp2mr_iter, txq_ctrl);
@@ -433,15 +442,19 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 {
 	struct priv *priv = dev->data->dev_private;
 	struct txq *txq = (*priv->txqs)[idx];
-	struct txq_ctrl *txq_ctrl;
+	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
 	int ret;
 
 	if (mlx5_is_secondary())
 		return -E_RTE_SECONDARY;
 
 	priv_lock(priv);
-	if (txq)
-		txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+	if (!rte_is_power_of_2(desc)) {
+		desc = 1 << log2above(desc);
+		WARN("%p: increased number of descriptors in TX queue %u"
+		     " to the next power of two (%d)",
+		     (void *)dev, idx, desc);
+	}
 	DEBUG("%p: configuring queue %u for %u descriptors",
 	      (void *)dev, idx, desc);
 	if (idx >= priv->txqs_n) {
@@ -460,8 +473,11 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		(*priv->txqs)[idx] = NULL;
 		txq_cleanup(txq_ctrl);
 	} else {
-		txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl),
-					     0, socket);
+		txq_ctrl =
+			rte_calloc_socket("TXQ", 1,
+					  sizeof(*txq_ctrl) +
+					  desc * sizeof(struct rte_mbuf *),
+					  0, socket);
 		if (txq_ctrl == NULL) {
 			ERROR("%p: unable to allocate queue index %u",
 			      (void *)dev, idx);
@@ -469,7 +485,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			return -ENOMEM;
 		}
 	}
-	ret = txq_setup(dev, txq_ctrl, desc, socket, conf);
+	ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
 	if (ret)
 		rte_free(txq_ctrl);
 	else {
@@ -504,7 +520,7 @@ mlx5_tx_queue_release(void *dpdk_txq)
 	if (txq == NULL)
 		return;
 	txq_ctrl = container_of(txq, struct txq_ctrl, txq);
-	priv = txq->priv;
+	priv = txq_ctrl->priv;
 	priv_lock(priv);
 	for (i = 0; (i != priv->txqs_n); ++i)
 		if ((*priv->txqs)[i] == txq) {
@@ -539,7 +555,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts,
 			      uint16_t pkts_n)
 {
 	struct txq *txq = dpdk_txq;
-	struct priv *priv = mlx5_secondary_data_setup(txq->priv);
+	struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq);
+	struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv);
 	struct priv *primary_priv;
 	unsigned int index;
 
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>,
	Olga Shern <olgas@mellanox.com>,
	Vasily Philipov <vasilyf@mellanox.com>
Subject: [dpdk-dev] [PATCH v5 15/25] mlx5: handle Rx CQE compression
Date: Thu, 23 Jun 2016 19:05:13 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170513.nZkYOjy2MRPaelbRz7OQpANZJtB-KtzPIoXns0EDV_I@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

Mini (compressed) CQEs are returned by the NIC when PCI back pressure is
detected, in which case the first CQE64 contains common packet information
followed by a number of CQE8 providing the rest, followed by a matching
number of empty CQE64 entries to be used by software for decompression.

Before decompression:

      0           1          2           6         7         8
  +-------+  +---------+ +-------+   +-------+ +-------+ +-------+
  | CQE64 |  |  CQE64  | | CQE64 |   | CQE64 | | CQE64 | | CQE64 |
  |-------|  |---------| |-------|   |-------| |-------| |-------|
  | ..... |  | cqe8[0] | |       | . |       | |       | | ..... |
  | ..... |  | cqe8[1] | |       | . |       | |       | | ..... |
  | ..... |  | ....... | |       | . |       | |       | | ..... |
  | ..... |  | cqe8[7] | |       |   |       | |       | | ..... |
  +-------+  +---------+ +-------+   +-------+ +-------+ +-------+

After decompression:

      0          1     ...     8
  +-------+  +-------+     +-------+
  | CQE64 |  | CQE64 |     | CQE64 |
  |-------|  |-------|     |-------|
  | ..... |  | ..... |  .  | ..... |
  | ..... |  | ..... |  .  | ..... |
  | ..... |  | ..... |  .  | ..... |
  | ..... |  | ..... |     | ..... |
  +-------+  +-------+     +-------+

This patch does not perform the entire decompression step, as doing so would
be prohibitively expensive; instead, the first CQE64 is consumed and an
internal context is maintained to interpret the following CQE8 entries
directly.

Intermediate empty CQE64 entries are handed back to HW without further
processing.
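
A deliberately simplified, self-contained sketch of that context-based
approach is shown below. All types and names (struct mini_cqe, struct
zip_ctx, poll_len(), CQ_SIZE) are placeholders for illustration and do not
reflect the driver's actual layout; the real logic is in mlx5_rx_poll_len()
in the diff below.

#include <stdint.h>

#define CQ_SIZE 256			/* completion ring size, power of two */
#define CQ_MASK (CQ_SIZE - 1)

struct mini_cqe {			/* stands in for a CQE8 */
	uint32_t byte_cnt;
};

struct cqe {				/* stands in for a CQE64 slot */
	int compressed;			/* nonzero for a compressed title CQE */
	uint32_t byte_cnt;		/* packet size, or mini CQE count */
	struct mini_cqe mc[8];		/* mini array when the slot is a filler */
};

struct zip_ctx {
	uint16_t ai;			/* mini CQEs consumed so far (0 = idle) */
	uint16_t cqe_cnt;		/* mini CQEs in the whole session */
	uint16_t ca;			/* slot holding the current mini array */
};

/* Return the next packet size; the consumer index only moves for regular
 * CQEs or once a compressed session has been fully drained. */
static uint32_t
poll_len(const struct cqe (*cq)[CQ_SIZE], struct zip_ctx *zip,
	 uint16_t *cq_ci)
{
	const struct cqe *title = &(*cq)[*cq_ci & CQ_MASK];
	uint32_t len;

	if (!zip->ai) {
		if (!title->compressed) {
			/* Regular, uncompressed completion. */
			++*cq_ci;
			return title->byte_cnt;
		}
		/* Start a session: the title carries the mini CQE count. */
		zip->cqe_cnt = title->byte_cnt;
		zip->ca = *cq_ci + 1;	/* first mini array follows the title */
	}
	/* Consume one mini CQE from the current array. */
	len = (*cq)[zip->ca & CQ_MASK].mc[zip->ai & 7].byte_cnt;
	if ((++zip->ai & 7) == 0)
		zip->ca += 8;		/* next filler slot holds the next array */
	if (zip->ai == zip->cqe_cnt) {
		*cq_ci += zip->cqe_cnt;	/* hand the drained slots back at once */
		zip->ai = 0;
	}
	return len;
}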

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Olga Shern <olgas@mellanox.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 doc/guides/nics/mlx5.rst     |   6 +
 drivers/net/mlx5/mlx5.c      |  25 +++-
 drivers/net/mlx5/mlx5.h      |   1 +
 drivers/net/mlx5/mlx5_rxq.c  |   9 +-
 drivers/net/mlx5/mlx5_rxtx.c | 265 +++++++++++++++++++++++++++++++++----------
 drivers/net/mlx5/mlx5_rxtx.h |  11 ++
 drivers/net/mlx5/mlx5_txq.c  |   5 +
 7 files changed, 253 insertions(+), 69 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 3a07928..756153b 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -148,6 +148,12 @@ Run-time configuration
 
 - **ethtool** operations on related kernel interfaces also affect the PMD.
 
+- ``rxq_cqe_comp_en`` parameter [int]
+
+  A nonzero value enables CQE compression on the RX side. This feature
+  saves PCI bandwidth and improves performance at the cost of slightly
+  higher CPU usage. Enabled by default.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6027393..0257d34 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -69,6 +69,9 @@
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
 
+/* Device parameter to enable RX completion queue compression. */
+#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
+
 /**
  * Retrieve integer value from environment variable.
  *
@@ -256,12 +259,21 @@ static int
 mlx5_args_check(const char *key, const char *val, void *opaque)
 {
 	struct priv *priv = opaque;
+	unsigned long tmp;
 
-	/* No parameters are expected at the moment. */
-	(void)priv;
-	(void)val;
-	WARN("%s: unknown parameter", key);
-	return -EINVAL;
+	errno = 0;
+	tmp = strtoul(val, NULL, 0);
+	if (errno) {
+		WARN("%s: \"%s\" is not a valid integer", key, val);
+		return errno;
+	}
+	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
+		priv->cqe_comp = !!tmp;
+	} else {
+		WARN("%s: unknown parameter", key);
+		return -EINVAL;
+	}
+	return 0;
 }
 
 /**
@@ -279,7 +291,7 @@ static int
 mlx5_args(struct priv *priv, struct rte_devargs *devargs)
 {
 	const char **params = (const char *[]){
-		NULL,
+		MLX5_RXQ_CQE_COMP_EN,
 	};
 	struct rte_kvargs *kvlist;
 	int ret = 0;
@@ -475,6 +487,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
+		priv->cqe_comp = 1; /* Enable compression by default. */
 		err = mlx5_args(priv, pci_dev->devargs);
 		if (err) {
 			ERROR("failed to process device arguments: %s",
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3dca03d..8f5a6df 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -111,6 +111,7 @@ struct priv {
 	unsigned int hw_padding:1; /* End alignment padding is supported. */
 	unsigned int sriov:1; /* This is a VF or PF with VF devices. */
 	unsigned int mps:1; /* Whether multi-packet send is supported. */
+	unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
 	unsigned int pending_alarm:1; /* An alarm is pending. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b1d6cfe..b2f8f9a 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -898,6 +898,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
 		return EINVAL;
 	}
 	tmpl->rxq.rq_db = rwq->rq.db;
+	tmpl->rxq.cqe_n = ibcq->cqe + 1;
 	tmpl->rxq.cq_ci = 0;
 	tmpl->rxq.rq_ci = 0;
 	tmpl->rxq.cq_db = cq->dbrec;
@@ -956,6 +957,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	} attr;
 	enum ibv_exp_query_intf_status status;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+	unsigned int cqe_n = desc - 1;
 	int ret = 0;
 
 	(void)conf; /* Thresholds configuration (ignored). */
@@ -996,7 +998,12 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
 		.res_domain = tmpl.rd,
 	};
-	tmpl.cq = ibv_exp_create_cq(priv->ctx, desc - 1, NULL, NULL, 0,
+	if (priv->cqe_comp) {
+		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+		cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
+	}
+	tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
 				    &attr.cq);
 	if (tmpl.cq == NULL) {
 		ret = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2372fce..43236f5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -69,44 +69,87 @@
 #include "mlx5_defs.h"
 #include "mlx5_prm.h"
 
-static inline volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
-	  unsigned int cqes_n, uint16_t *ci)
-	  __attribute__((always_inline));
+#ifndef NDEBUG
 
+/**
+ * Verify or set magic value in CQE.
+ *
+ * @param cqe
+ *   Pointer to CQE.
+ *
+ * @return
+ *   0 the first time.
+ */
 static inline int
-rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
-
-static volatile struct mlx5_cqe64 *
-get_cqe64(volatile struct mlx5_cqe cqes[],
-	  unsigned int cqes_n, uint16_t *ci)
+check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
 {
-	volatile struct mlx5_cqe64 *cqe;
-	uint16_t idx = *ci;
-	uint8_t op_own;
-
-	cqe = &cqes[idx & (cqes_n - 1)].cqe64;
-	op_own = cqe->op_own;
-	if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
-		return NULL;
-	} else if (unlikely(op_own & 0x80)) {
-		switch (op_own >> 4) {
-		case MLX5_CQE_INVALID:
-			return NULL; /* No CQE */
-		case MLX5_CQE_REQ_ERR:
-			return cqe;
-		case MLX5_CQE_RESP_ERR:
-			++(*ci);
-			return NULL;
-		default:
-			return NULL;
+	static const uint8_t magic[] = "seen";
+	volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
+	int ret = 1;
+	unsigned int i;
+
+	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
+		if (!ret || (*buf)[i] != magic[i]) {
+			ret = 0;
+			(*buf)[i] = magic[i];
 		}
+	return ret;
+}
+
+#endif /* NDEBUG */
+
+static inline int
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+	    unsigned int cqes_n, const uint16_t ci)
+	    __attribute__((always_inline));
+
+/**
+ * Check whether CQE is valid.
+ *
+ * @param cqe
+ *   Pointer to CQE.
+ * @param cqes_n
+ *   Size of completion queue.
+ * @param ci
+ *   Consumer index.
+ *
+ * @return
+ *   0 on success, 1 on failure.
+ */
+static inline int
+check_cqe64(volatile struct mlx5_cqe64 *cqe,
+		unsigned int cqes_n, const uint16_t ci)
+{
+	uint16_t idx = ci & cqes_n;
+	uint8_t op_own = cqe->op_own;
+	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
+	uint8_t op_code = MLX5_CQE_OPCODE(op_own);
+
+	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
+		return 1; /* No CQE. */
+#ifndef NDEBUG
+	if ((op_code == MLX5_CQE_RESP_ERR) ||
+	    (op_code == MLX5_CQE_REQ_ERR)) {
+		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
+		uint8_t syndrome = err_cqe->syndrome;
+
+		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
+		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
+			return 0;
+		if (!check_cqe64_seen(cqe))
+			ERROR("unexpected CQE error %u (0x%02x)"
+			      " syndrome 0x%02x",
+			      op_code, op_code, syndrome);
+		return 1;
+	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
+		   (op_code != MLX5_CQE_REQ)) {
+		if (!check_cqe64_seen(cqe))
+			ERROR("unexpected CQE opcode %u (0x%02x)",
+			      op_code, op_code);
+		return 1;
 	}
-	if (cqe) {
-		*ci = idx + 1;
-		return cqe;
-	}
-	return NULL;
+#endif /* NDEBUG */
+	return 0;
 }
 
 /**
@@ -125,20 +168,34 @@ txq_complete(struct txq *txq)
 {
 	const unsigned int elts_n = txq->elts_n;
 	const unsigned int cqe_n = txq->cqe_n;
+	const unsigned int cqe_cnt = cqe_n - 1;
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
 	uint16_t cq_ci = txq->cq_ci;
 	unsigned int wqe_ci = (unsigned int)-1;
-	int ret = 0;
 
-	while (ret == 0) {
-		volatile struct mlx5_cqe64 *cqe;
+	do {
+		unsigned int idx = cq_ci & cqe_cnt;
+		volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;
 
-		cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci);
-		if (cqe == NULL)
+		if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
 			break;
+#ifndef NDEBUG
+		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
+			if (!check_cqe64_seen(cqe))
+				ERROR("unexpected compressed CQE, TX stopped");
+			return;
+		}
+		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
+		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
+			if (!check_cqe64_seen(cqe))
+				ERROR("unexpected error CQE, TX stopped");
+			return;
+		}
+#endif /* NDEBUG */
 		wqe_ci = ntohs(cqe->wqe_counter);
-	}
+		++cq_ci;
+	} while (1);
 	if (unlikely(wqe_ci == (unsigned int)-1))
 		return;
 	/* Free buffers. */
@@ -509,6 +566,100 @@ rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
 }
 
 /**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ *   Pointer to RX queue.
+ * @param cqe
+ *   CQE to process.
+ *
+ * @return
+ *   Packet size in bytes (0 if there is none), -1 in case of completion
+ *   with error.
+ */
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
+		 uint16_t cqe_cnt)
+{
+	struct rxq_zip *zip = &rxq->zip;
+	uint16_t cqe_n = cqe_cnt + 1;
+	int len = 0;
+
+	/* Process compressed data in the CQE and mini arrays. */
+	if (zip->ai) {
+		volatile struct mlx5_mini_cqe8 (*mc)[8] =
+			(volatile struct mlx5_mini_cqe8 (*)[8])
+			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64);
+
+		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
+		if ((++zip->ai & 7) == 0) {
+			/*
+			 * Increment consumer index to skip the number of
+			 * CQEs consumed. Hardware leaves holes in the CQ
+			 * ring for software use.
+			 */
+			zip->ca = zip->na;
+			zip->na += 8;
+		}
+		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+			uint16_t idx = rxq->cq_ci;
+			uint16_t end = zip->cq_ci;
+
+			while (idx != end) {
+				(*rxq->cqes)[idx & cqe_cnt].cqe64.op_own =
+					MLX5_CQE_INVALIDATE;
+				++idx;
+			}
+			rxq->cq_ci = zip->cq_ci;
+			zip->ai = 0;
+		}
+	/* No compressed data, get next CQE and verify if it is compressed. */
+	} else {
+		int ret;
+		int8_t op_own;
+
+		ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
+		if (unlikely(ret == 1))
+			return 0;
+		++rxq->cq_ci;
+		op_own = cqe->op_own;
+		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+			volatile struct mlx5_mini_cqe8 (*mc)[8] =
+				(volatile struct mlx5_mini_cqe8 (*)[8])
+				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
+							  cqe_cnt].cqe64);
+
+			/* Fix endianness. */
+			zip->cqe_cnt = ntohl(cqe->byte_cnt);
+			/*
+			 * Current mini array position is the one returned by
+			 * check_cqe64().
+			 *
+			 * If completion comprises several mini arrays, as a
+			 * special case the second one is located 7 CQEs after
+			 * the initial CQE instead of 8 for subsequent ones.
+			 */
+			zip->ca = rxq->cq_ci & cqe_cnt;
+			zip->na = zip->ca + 7;
+			/* Compute the next non compressed CQE. */
+			--rxq->cq_ci;
+			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+			/* Get packet size to return. */
+			len = ntohl((*mc)[0].byte_cnt);
+			zip->ai = 1;
+		} else {
+			len = ntohl(cqe->byte_cnt);
+		}
+		/* Error while receiving packet. */
+		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
+			return -1;
+	}
+	return len;
+}
+
+/**
  * Translate RX completion flags to offload flags.
  *
  * @param[in] rxq
@@ -556,26 +707,6 @@ rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
 }
 
 /**
- * Get size of the next packet.
- *
- * @param rxq
- *   RX queue to fetch packet from.
- *
- * @return
- *   Packet size in bytes.
- */
-static inline int __attribute__((always_inline))
-rx_poll_len(struct rxq *rxq)
-{
-	volatile struct mlx5_cqe64 *cqe;
-
-	cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
-	if (cqe)
-		return ntohl(cqe->byte_cnt);
-	return 0;
-}
-
-/**
  * DPDK callback for RX.
  *
  * @param dpdk_rxq
@@ -597,15 +728,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int rq_ci = rxq->rq_ci;
 	const unsigned int elts_n = rxq->elts_n;
 	const unsigned int wqe_cnt = elts_n - 1;
+	const unsigned int cqe_cnt = rxq->cqe_n - 1;
 
 	for (i = 0; (i != pkts_n); ++i) {
 		unsigned int idx = rq_ci & wqe_cnt;
+		int len;
 		struct rte_mbuf *rep;
 		struct rte_mbuf *pkt;
-		unsigned int len;
 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
 		volatile struct mlx5_cqe64 *cqe =
-			&(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
+			&(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
 
 		pkt = (*rxq->elts)[idx];
 		rte_prefetch0(cqe);
@@ -618,12 +750,20 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		NB_SEGS(rep) = 1;
 		PORT(rep) = rxq->port_id;
 		NEXT(rep) = NULL;
-		len = rx_poll_len(rxq);
+		len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
 		if (unlikely(len == 0)) {
 			rte_mbuf_refcnt_set(rep, 0);
 			__rte_mbuf_raw_free(rep);
 			break;
 		}
+		if (unlikely(len == -1)) {
+			/* RX error, packet is likely too large. */
+			rte_mbuf_refcnt_set(rep, 0);
+			__rte_mbuf_raw_free(rep);
+			++rxq->stats.idropped;
+			--i;
+			goto skip;
+		}
 		/*
 		 * Fill NIC descriptor with the new buffer.  The lkey and size
 		 * of the buffers are already known, only the buffer address
@@ -655,6 +795,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Return packet. */
 		*(pkts++) = pkt;
 		++pkts_ret;
+skip:
 		++rq_ci;
 	}
 	if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 6b3bb2d..77b0fde 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -91,6 +91,15 @@ struct fdir_queue {
 
 struct priv;
 
+/* Compressed CQE context. */
+struct rxq_zip {
+	uint16_t ai; /* Array index. */
+	uint16_t ca; /* Current array index. */
+	uint16_t na; /* Next array index. */
+	uint16_t cq_ci; /* The next CQE. */
+	uint32_t cqe_cnt; /* Number of CQEs. */
+};
+
 /* RX queue descriptor. */
 struct rxq {
 	unsigned int csum:1; /* Enable checksum offloading. */
@@ -100,9 +109,11 @@ struct rxq {
 	uint16_t rq_ci;
 	uint16_t cq_ci;
 	uint16_t elts_n;
+	uint16_t cqe_n; /* Number of CQ elements. */
 	uint16_t port_id;
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
+	struct rxq_zip zip; /* Compressed context. */
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	struct rte_mbuf *(*elts)[];
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 26d6168..22e9bae 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -268,6 +268,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 	enum ibv_exp_query_intf_status status;
 	int ret = 0;
 
+	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+		ret = ENOTSUP;
+		ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
+		goto error;
+	}
 	(void)conf; /* Thresholds configuration (ignored). */
 	tmpl.txq.elts_n = desc;
 	/*
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>,
	Vasily Philipov <vasilyf@mellanox.com>
Subject: [dpdk-dev] [PATCH v5 16/25] mlx5: replace countdown with threshold for Tx completions
Date: Thu, 23 Jun 2016 19:05:14 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170514.C9cVXT2Pvxm9dIZQIF8-ESpyReHetnfmS0_zHeNMF7A@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

Replacing the variable countdown (which depends on the number of
descriptors) with a fixed relative threshold known at compile time improves
performance by reducing the TX queue structure footprint and the amount of
code to manage completions during a burst.

Completions are now requested at most once per burst, after the threshold
has been reached.
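
A stripped-down sketch of that bookkeeping follows (the names are
illustrative and TX_COMP_THRESH stands in for MLX5_TX_COMP_THRESH; the hunk
in mlx5_tx_burst() below is the authoritative version):

#include <stdint.h>

#define TX_COMP_THRESH 32	/* fixed threshold, power of two */

struct tx_state {
	uint16_t elts_comp;	/* descriptors posted since the last request */
};

/*
 * Called once at the end of a burst that posted "sent" descriptors.
 * Returns nonzero when the last WQE of the burst should request a
 * completion report; counting then restarts from zero.
 */
static int
burst_needs_completion(struct tx_state *tx, uint16_t sent)
{
	uint16_t comp = tx->elts_comp + sent;

	if (comp >= TX_COMP_THRESH) {
		tx->elts_comp = 0;
		return 1;
	}
	tx->elts_comp = comp;	/* accumulate across bursts */
	return 0;
}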

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx5/mlx5_defs.h |  7 +++++--
 drivers/net/mlx5/mlx5_rxtx.c | 44 +++++++++++++++++++++++++-------------------
 drivers/net/mlx5/mlx5_rxtx.h |  5 ++---
 drivers/net/mlx5/mlx5_txq.c  | 21 ++++++++++++---------
 4 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 8d2ec7a..cc2a6f3 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -48,8 +48,11 @@
 /* Maximum number of special flows. */
 #define MLX5_MAX_SPECIAL_FLOWS 4
 
-/* Request send completion once in every 64 sends, might be less. */
-#define MLX5_PMD_TX_PER_COMP_REQ 64
+/*
+ * Request TX completion every time descriptors reach this threshold since
+ * the previous request. Must be a power of two for performance reasons.
+ */
+#define MLX5_TX_COMP_THRESH 32
 
 /* RSS Indirection table size. */
 #define RSS_INDIRECTION_TABLE_SIZE 256
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 43236f5..9d992c3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -156,9 +156,6 @@ check_cqe64(volatile struct mlx5_cqe64 *cqe,
  * Manage TX completions.
  *
  * When sending a burst, mlx5_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
  *
  * @param txq
  *   Pointer to TX queue structure.
@@ -172,14 +169,16 @@ txq_complete(struct txq *txq)
 	uint16_t elts_free = txq->elts_tail;
 	uint16_t elts_tail;
 	uint16_t cq_ci = txq->cq_ci;
-	unsigned int wqe_ci = (unsigned int)-1;
+	volatile struct mlx5_cqe64 *cqe = NULL;
+	volatile union mlx5_wqe *wqe;
 
 	do {
-		unsigned int idx = cq_ci & cqe_cnt;
-		volatile struct mlx5_cqe64 *cqe = &(*txq->cqes)[idx].cqe64;
+		volatile struct mlx5_cqe64 *tmp;
 
-		if (check_cqe64(cqe, cqe_n, cq_ci) == 1)
+		tmp = &(*txq->cqes)[cq_ci & cqe_cnt].cqe64;
+		if (check_cqe64(tmp, cqe_n, cq_ci))
 			break;
+		cqe = tmp;
 #ifndef NDEBUG
 		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
 			if (!check_cqe64_seen(cqe))
@@ -193,14 +192,15 @@ txq_complete(struct txq *txq)
 			return;
 		}
 #endif /* NDEBUG */
-		wqe_ci = ntohs(cqe->wqe_counter);
 		++cq_ci;
 	} while (1);
-	if (unlikely(wqe_ci == (unsigned int)-1))
+	if (unlikely(cqe == NULL))
 		return;
+	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
+	elts_tail = wqe->wqe.ctrl.data[3];
+	assert(elts_tail < txq->wqe_n);
 	/* Free buffers. */
-	elts_tail = (wqe_ci + 1) & (elts_n - 1);
-	do {
+	while (elts_free != elts_tail) {
 		struct rte_mbuf *elt = (*txq->elts)[elts_free];
 		unsigned int elts_free_next =
 			(elts_free + 1) & (elts_n - 1);
@@ -216,7 +216,7 @@ txq_complete(struct txq *txq)
 		/* Only one segment needs to be freed. */
 		rte_pktmbuf_free_seg(elt);
 		elts_free = elts_free_next;
-	} while (elts_free != elts_tail);
+	}
 	txq->cq_ci = cq_ci;
 	txq->elts_tail = elts_tail;
 	/* Update the consumer index. */
@@ -437,6 +437,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	const unsigned int elts_n = txq->elts_n;
 	unsigned int i;
 	unsigned int max;
+	unsigned int comp;
 	volatile union mlx5_wqe *wqe;
 	struct rte_mbuf *buf;
 
@@ -486,13 +487,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					    buf->vlan_tci);
 		else
 			mlx5_wqe_write(txq, wqe, addr, length, lkey);
-		/* Request completion if needed. */
-		if (unlikely(--txq->elts_comp == 0)) {
-			wqe->wqe.ctrl.data[2] = htonl(8);
-			txq->elts_comp = txq->elts_comp_cd_init;
-		} else {
-			wqe->wqe.ctrl.data[2] = 0;
-		}
+		wqe->wqe.ctrl.data[2] = 0;
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -512,6 +507,17 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
+	/* Check whether completion threshold has been reached. */
+	comp = txq->elts_comp + i;
+	if (comp >= MLX5_TX_COMP_THRESH) {
+		/* Request completion on last WQE. */
+		wqe->wqe.ctrl.data[2] = htonl(8);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		wqe->wqe.ctrl.data[3] = elts_head;
+		txq->elts_comp = 0;
+	} else {
+		txq->elts_comp = comp;
+	}
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Increment sent packets counter. */
 	txq->stats.opackets += i;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 77b0fde..f900e65 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -238,8 +238,7 @@ struct hash_rxq {
 struct txq {
 	uint16_t elts_head; /* Current index in (*elts)[]. */
 	uint16_t elts_tail; /* First element awaiting completion. */
-	uint16_t elts_comp_cd_init; /* Initial value for countdown. */
-	uint16_t elts_comp; /* Elements before asking a completion. */
+	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t elts_n; /* (*elts)[] length. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t cqe_n; /* Number of CQ elements. */
@@ -247,6 +246,7 @@ struct txq {
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
+	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
 	volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
@@ -259,7 +259,6 @@ struct txq {
 	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
-	uint32_t qp_num_8s; /* QP number shifted by 8. */
 } __rte_cache_aligned;
 
 /* TX queue control descriptor. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 22e9bae..7b2dc7c 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -89,6 +89,7 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
 	DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
 	txq_ctrl->txq.elts_head = 0;
 	txq_ctrl->txq.elts_tail = 0;
+	txq_ctrl->txq.elts_comp = 0;
 }
 
 /**
@@ -108,6 +109,7 @@ txq_free_elts(struct txq_ctrl *txq_ctrl)
 	DEBUG("%p: freeing WRs", (void *)txq_ctrl);
 	txq_ctrl->txq.elts_head = 0;
 	txq_ctrl->txq.elts_tail = 0;
+	txq_ctrl->txq.elts_comp = 0;
 
 	while (elts_tail != elts_head) {
 		struct rte_mbuf *elt = (*elts)[elts_tail];
@@ -274,15 +276,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		goto error;
 	}
 	(void)conf; /* Thresholds configuration (ignored). */
+	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl.txq.elts_n = desc;
-	/*
-	 * Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
-	 * at least 4 times per ring.
-	 */
-	tmpl.txq.elts_comp_cd_init =
-		((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ?
-		 MLX5_PMD_TX_PER_COMP_REQ : (desc / 4));
-	tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init;
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -302,7 +297,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		.res_domain = tmpl.rd,
 	};
 	tmpl.cq = ibv_exp_create_cq(priv->ctx,
-				    (desc / tmpl.txq.elts_comp_cd_init) - 1,
+				    (((desc / MLX5_TX_COMP_THRESH) - 1) ?
+				     ((desc / MLX5_TX_COMP_THRESH) - 1) : 1),
 				    NULL, NULL, 0, &attr.cq);
 	if (tmpl.cq == NULL) {
 		ret = ENOMEM;
@@ -454,6 +450,13 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		return -E_RTE_SECONDARY;
 
 	priv_lock(priv);
+	if (desc <= MLX5_TX_COMP_THRESH) {
+		WARN("%p: number of descriptors requested for TX queue %u"
+		     " must be higher than MLX5_TX_COMP_THRESH, using"
+		     " %u instead of %u",
+		     (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
+		desc = MLX5_TX_COMP_THRESH + 1;
+	}
 	if (!rte_is_power_of_2(desc)) {
 		desc = 1 << log2above(desc);
 		WARN("%p: increased number of descriptors in TX queue %u"
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Yaacov Hazan <yaacovh@mellanox.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 17/25] mlx5: add support for inline send
Date: Thu, 23 Jun 2016 19:05:15 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170515.zMtKtgtXgMhwIHy0wUVsvjw1SXXQ8E57yK5xeACrzTQ@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Yaacov Hazan <yaacovh@mellanox.com>

Implement the send inline feature, which copies packet data directly into
WQEs to improve latency. The maximum packet size and the minimum number of
Tx queues required to qualify for inline send are user-configurable.

This feature is effective when HW causes a performance bottleneck.
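
As a rough, worked sketch of the space accounting involved (it mirrors the
size formula used by mlx5_wqe_write_inline() in the hunk below; the segment
breakdown described in the comment is an assumption of this sketch, not
taken from the patch):

#include <stdint.h>

#define INLINE_HDR_SIZE 16	/* bytes copied into the Ethernet segment */

/*
 * Number of 16-byte units occupied by a WQE that inlines a whole packet of
 * pkt_len bytes: a fixed part assumed to cover the control and Ethernet
 * segments (the "3" below), plus the inline data segment made of a 4-byte
 * byte count followed by the remaining payload, rounded up to whole
 * 16-byte units.
 */
static unsigned int
inline_wqe_units(uint32_t pkt_len)
{
	uint32_t rest = pkt_len - INLINE_HDR_SIZE;

	return 3 + (4 + rest + 15) / 16;
}

For example, a 64-byte packet yields 3 + (4 + 48 + 15) / 16 = 7 units;
whatever does not fit in the first ring slot is copied into the following
slots by the while loop in the patch.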

Signed-off-by: Yaacov Hazan <yaacovh@mellanox.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 doc/guides/nics/mlx5.rst       |  17 +++
 drivers/net/mlx5/mlx5.c        |  15 +++
 drivers/net/mlx5/mlx5.h        |   2 +
 drivers/net/mlx5/mlx5_ethdev.c |   5 +
 drivers/net/mlx5/mlx5_rxtx.c   | 273 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h   |   2 +
 drivers/net/mlx5/mlx5_txq.c    |   4 +
 7 files changed, 318 insertions(+)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 756153b..9ada221 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -154,6 +154,23 @@ Run-time configuration
   saves PCI bandwidth and improves performance at the cost of slightly
   higher CPU usage. Enabled by default.
 
+- ``txq_inline`` parameter [int]
+
+  Amount of data to be inlined during TX operations. Improves latency.
+  Can improve PPS performance when PCI back pressure is detected and may be
+  useful for scenarios involving heavy traffic on many queues.
+
+  It is not enabled by default (set to 0) since the additional software
+  logic necessary to handle this mode can lower performance when back
+  pressure is not expected.
+
+- ``txqs_min_inline`` parameter [int]
+
+  Enable inline send only when the number of TX queues is greater than or
+  equal to this value.
+
+  This option should be used in combination with ``txq_inline`` above.
+
 Prerequisites
 -------------
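
As an illustrative aside: both parameters are meant to be supplied together
as device arguments, e.g. something like
-w 0000:05:00.0,txq_inline=128,txqs_min_inline=4 on the application command
line (the PCI address and values are placeholders, and the exact EAL
white-list syntax is an assumption here, not part of this patch).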
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 0257d34..2d63a48 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -72,6 +72,15 @@
 /* Device parameter to enable RX completion queue compression. */
 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
 
+/* Device parameter to configure inline send. */
+#define MLX5_TXQ_INLINE "txq_inline"
+
+/*
+ * Device parameter to configure the number of TX queues threshold for
+ * enabling inline send.
+ */
+#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
+
 /**
  * Retrieve integer value from environment variable.
  *
@@ -269,6 +278,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	}
 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
 		priv->cqe_comp = !!tmp;
+	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
+		priv->txq_inline = tmp;
+	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
+		priv->txqs_inline = tmp;
 	} else {
 		WARN("%s: unknown parameter", key);
 		return -EINVAL;
@@ -292,6 +305,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
 {
 	const char **params = (const char *[]){
 		MLX5_RXQ_CQE_COMP_EN,
+		MLX5_TXQ_INLINE,
+		MLX5_TXQS_MIN_INLINE,
 	};
 	struct rte_kvargs *kvlist;
 	int ret = 0;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8f5a6df..3a86609 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -113,6 +113,8 @@ struct priv {
 	unsigned int mps:1; /* Whether multi-packet send is supported. */
 	unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
 	unsigned int pending_alarm:1; /* An alarm is pending. */
+	unsigned int txq_inline; /* Maximum packet size for inlining. */
+	unsigned int txqs_inline; /* Queue number threshold for inlining. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 47e64b2..aeea4ff 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1318,6 +1318,11 @@ void
 priv_select_tx_function(struct priv *priv)
 {
 	priv->dev->tx_pkt_burst = mlx5_tx_burst;
+	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+		priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
+		DEBUG("selected inline TX function (%u >= %u queues)",
+		      priv->txqs_n, priv->txqs_inline);
+	}
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 9d992c3..daa22d9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -376,6 +376,139 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
 }
 
 /**
+ * Write an inline WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the WQE to fill.
+ * @param addr
+ *   Buffer data address.
+ * @param length
+ *   Packet length.
+ */
+static inline void
+mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
+		      uintptr_t addr, uint32_t length)
+{
+	uint32_t size;
+	uint16_t wqe_cnt = txq->wqe_n - 1;
+	uint16_t wqe_ci = txq->wqe_ci + 1;
+
+	/* Copy the first 16 bytes into inline header. */
+	rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+		   (void *)(uintptr_t)addr,
+		   MLX5_ETH_INLINE_HEADER_SIZE);
+	addr += MLX5_ETH_INLINE_HEADER_SIZE;
+	length -= MLX5_ETH_INLINE_HEADER_SIZE;
+	size = 3 + ((4 + length + 15) / 16);
+	wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+	rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+		   (void *)addr, MLX5_WQE64_INL_DATA);
+	addr += MLX5_WQE64_INL_DATA;
+	length -= MLX5_WQE64_INL_DATA;
+	while (length) {
+		volatile union mlx5_wqe *wqe_next =
+			&(*txq->wqes)[wqe_ci & wqe_cnt];
+		uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+				      sizeof(*wqe) :
+				      length;
+
+		rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+			  (uint8_t *)addr);
+		addr += copy_bytes;
+		length -= copy_bytes;
+		++wqe_ci;
+	}
+	assert(size < 64);
+	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+	wqe->inl.ctrl.data[3] = 0;
+	wqe->inl.eseg.rsvd0 = 0;
+	wqe->inl.eseg.rsvd1 = 0;
+	wqe->inl.eseg.mss = 0;
+	wqe->inl.eseg.rsvd2 = 0;
+	wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
+	/* Increment consumer index. */
+	txq->wqe_ci = wqe_ci;
+}
+
+/**
+ * Write an inline WQE with VLAN.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param wqe
+ *   Pointer to the WQE to fill.
+ * @param addr
+ *   Buffer data address.
+ * @param length
+ *   Packet length.
+ * @param vlan_tci
+ *   VLAN field to insert in packet.
+ */
+static inline void
+mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
+			   uintptr_t addr, uint32_t length, uint16_t vlan_tci)
+{
+	uint32_t size;
+	uint32_t wqe_cnt = txq->wqe_n - 1;
+	uint16_t wqe_ci = txq->wqe_ci + 1;
+	uint32_t vlan = htonl(0x81000000 | vlan_tci);
+
+	/*
+	 * Copy 12 bytes of source & destination MAC address.
+	 * Copy 4 bytes of VLAN.
+	 * Copy 2 bytes of Ether type.
+	 */
+	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
+		   (uint8_t *)addr, 12);
+	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12,
+		   &vlan, sizeof(vlan));
+	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 16,
+		   ((uint8_t *)addr + 12), 2);
+	addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+	length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
+	size = (sizeof(wqe->inl.ctrl.ctrl) +
+		sizeof(wqe->inl.eseg) +
+		sizeof(wqe->inl.byte_cnt) +
+		length + 15) / 16;
+	wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
+	rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
+		   (void *)addr, MLX5_WQE64_INL_DATA);
+	addr += MLX5_WQE64_INL_DATA;
+	length -= MLX5_WQE64_INL_DATA;
+	while (length) {
+		volatile union mlx5_wqe *wqe_next =
+			&(*txq->wqes)[wqe_ci & wqe_cnt];
+		uint32_t copy_bytes = (length > sizeof(*wqe)) ?
+				      sizeof(*wqe) :
+				      length;
+
+		rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
+			  (uint8_t *)addr);
+		addr += copy_bytes;
+		length -= copy_bytes;
+		++wqe_ci;
+	}
+	assert(size < 64);
+	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
+	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+	wqe->inl.ctrl.data[3] = 0;
+	wqe->inl.eseg.rsvd0 = 0;
+	wqe->inl.eseg.rsvd1 = 0;
+	wqe->inl.eseg.mss = 0;
+	wqe->inl.eseg.rsvd2 = 0;
+	wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
+	/* Increment consumer index. */
+	txq->wqe_ci = wqe_ci;
+}
+
+/**
  * Ring TX queue doorbell.
  *
  * @param txq
@@ -417,6 +550,23 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
 }
 
 /**
+ * Prefetch a WQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param ci
+ *   WQE consumer index.
+ */
+static inline void
+tx_prefetch_wqe(struct txq *txq, uint16_t ci)
+{
+	volatile union mlx5_wqe *wqe;
+
+	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
+	rte_prefetch0(wqe);
+}
+
+/**
  * DPDK callback for TX.
  *
  * @param dpdk_txq
@@ -529,6 +679,129 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 }
 
 /**
+ * DPDK callback for TX with inline support.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	uint16_t elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	unsigned int i;
+	unsigned int max;
+	unsigned int comp;
+	volatile union mlx5_wqe *wqe;
+	struct rte_mbuf *buf;
+	unsigned int max_inline = txq->max_inline;
+
+	if (unlikely(!pkts_n))
+		return 0;
+	buf = pkts[0];
+	/* Prefetch first packet cacheline. */
+	tx_prefetch_cqe(txq, txq->cq_ci);
+	tx_prefetch_cqe(txq, txq->cq_ci + 1);
+	rte_prefetch0(buf);
+	/* Start processing. */
+	txq_complete(txq);
+	max = (elts_n - (elts_head - txq->elts_tail));
+	if (max > elts_n)
+		max -= elts_n;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max == 0)
+		return 0;
+	if (max > pkts_n)
+		max = pkts_n;
+	for (i = 0; (i != max); ++i) {
+		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+		uintptr_t addr;
+		uint32_t length;
+		uint32_t lkey;
+
+		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		tx_prefetch_wqe(txq, txq->wqe_ci);
+		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+		if (i + 1 < max)
+			rte_prefetch0(pkts[i + 1]);
+		/* Should we enable HW CKSUM offload */
+		if (buf->ol_flags &
+		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
+			wqe->inl.eseg.cs_flags =
+				MLX5_ETH_WQE_L3_CSUM |
+				MLX5_ETH_WQE_L4_CSUM;
+		} else {
+			wqe->inl.eseg.cs_flags = 0;
+		}
+		/* Retrieve buffer information. */
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		length = DATA_LEN(buf);
+		/* Update element. */
+		(*txq->elts)[elts_head] = buf;
+		/* Prefetch next buffer data. */
+		if (i + 1 < max)
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+						       volatile void *));
+		if (length <= max_inline) {
+			if (buf->ol_flags & PKT_TX_VLAN_PKT)
+				mlx5_wqe_write_inline_vlan(txq, wqe,
+							   addr, length,
+							   buf->vlan_tci);
+			else
+				mlx5_wqe_write_inline(txq, wqe, addr, length);
+		} else {
+			/* Retrieve Memory Region key for this memory pool. */
+			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+			if (buf->ol_flags & PKT_TX_VLAN_PKT)
+				mlx5_wqe_write_vlan(txq, wqe, addr, length,
+						    lkey, buf->vlan_tci);
+			else
+				mlx5_wqe_write(txq, wqe, addr, length, lkey);
+		}
+		wqe->inl.ctrl.data[2] = 0;
+		elts_head = elts_head_next;
+		buf = pkts[i + 1];
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += length;
+#endif
+	}
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Check whether completion threshold has been reached. */
+	comp = txq->elts_comp + i;
+	if (comp >= MLX5_TX_COMP_THRESH) {
+		/* Request completion on last WQE. */
+		wqe->inl.ctrl.data[2] = htonl(8);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		wqe->inl.ctrl.data[3] = elts_head;
+		txq->elts_comp = 0;
+	} else {
+		txq->elts_comp = comp;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+#endif
+	/* Ring QP doorbell. */
+	mlx5_tx_dbrec(txq);
+	txq->elts_head = elts_head;
+	return i;
+}
+
+/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index f900e65..3c83148 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -246,6 +246,7 @@ struct txq {
 	uint16_t wqe_n; /* Number of WQ elements. */
 	uint16_t bf_offset; /* Blueflame offset. */
 	uint16_t bf_buf_size; /* Blueflame size. */
+	uint16_t max_inline; /* Maximum size to inline in a WQE. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
 	volatile union mlx5_wqe (*wqes)[]; /* Work queue. */
@@ -310,6 +311,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 /* mlx5_rxtx.c */
 
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 7b2dc7c..6a4a96e 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -332,6 +332,10 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
 			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
 	};
+	if (priv->txq_inline && priv->txqs_n >= priv->txqs_inline) {
+		tmpl.txq.max_inline = priv->txq_inline;
+		attr.init.cap.max_inline_data = tmpl.txq.max_inline;
+	}
 	tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
 	if (tmpl.qp == NULL) {
 		ret = (errno ? errno : EINVAL);
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 19/25] mlx5: add debugging information about Tx queues capabilities
Date: Thu, 23 Jun 2016 19:05:17 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170517.ycK2rPTtNvFxo09oqOOtXxyNg3B1-eb0LJbTW_v0oug@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_txq.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4f17fb0..bae9f3d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -343,6 +343,11 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
+	DEBUG("TX queue capabilities: max_send_wr=%u, max_send_sge=%u,"
+	      " max_inline_data=%u",
+	      attr.init.cap.max_send_wr,
+	      attr.init.cap.max_send_sge,
+	      attr.init.cap.max_inline_data);
 	attr.mod = (struct ibv_exp_qp_attr){
 		/* Move the QP to this state. */
 		.qp_state = IBV_QPS_INIT,
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>,
	Olga Shern <olgas@mellanox.com>
Subject: [dpdk-dev] [PATCH v5 18/25] mlx5: add support for multi-packet send
Date: Thu, 23 Jun 2016 19:05:16 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170516.w4iJra1HFfRZ_zHAelebpEf0mt8Paa-tfm0_Yaw9NM4@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

This feature enables the TX burst function to emit up to 5 packets using
only two WQEs on devices that support it, saving PCI bandwidth and
improving performance.
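
The following standalone sketch (not part of this patch) shows the test the
MPW burst function applies before adding a packet to the open multi-packet
session: length and checksum flags must match and a descriptor must remain.
The structure and the MPW_DSEG_MAX value are simplified assumptions.

#include <stdbool.h>
#include <stdint.h>

#define MPW_DSEG_MAX 5 /* assumed, mirrors MLX5_MPW_DSEG_MAX */

struct mpw_session {
	bool opened;
	uint32_t len;        /* packet length shared by the session */
	uint8_t cs_flags;    /* checksum flags shared by the session */
	unsigned int pkts_n; /* descriptors already used */
};

/* Return true when a packet can reuse the currently open session. */
static bool
mpw_can_append(const struct mpw_session *mpw, uint32_t pkt_len,
	       uint8_t cs_flags)
{
	return mpw->opened &&
	       mpw->len == pkt_len &&
	       mpw->cs_flags == cs_flags &&
	       mpw->pkts_n < MPW_DSEG_MAX;
}

int
main(void)
{
	struct mpw_session s = {
		.opened = true, .len = 64, .cs_flags = 0, .pkts_n = 2,
	};

	return !mpw_can_append(&s, 64, 0); /* 0 = packet can join */
}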

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Olga Shern <olgas@mellanox.com>
---
 doc/guides/nics/mlx5.rst       |  10 +
 drivers/net/mlx5/mlx5.c        |  14 +-
 drivers/net/mlx5/mlx5_ethdev.c |  15 +-
 drivers/net/mlx5/mlx5_rxtx.c   | 407 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h   |   2 +
 drivers/net/mlx5/mlx5_txq.c    |   2 +-
 6 files changed, 446 insertions(+), 4 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 9ada221..063c4a5 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -171,6 +171,16 @@ Run-time configuration
 
   This option should be used in combination with ``txq_inline`` above.
 
+- ``txq_mpw_en`` parameter [int]
+
+  A nonzero value enables multi-packet send. This feature allows the TX
+  burst function to pack up to five packets in two descriptors in order to
+  save PCI bandwidth and improve performance at the cost of a slightly
+  higher CPU usage.
+
+  It is currently only supported on the ConnectX-4 Lx family of adapters.
+  Enabled by default.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 2d63a48..0e83dd5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -81,6 +81,9 @@
  */
 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
 
+/* Device parameter to enable multi-packet send WQEs. */
+#define MLX5_TXQ_MPW_EN "txq_mpw_en"
+
 /**
  * Retrieve integer value from environment variable.
  *
@@ -282,6 +285,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		priv->txq_inline = tmp;
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		priv->txqs_inline = tmp;
+	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
+		priv->mps = !!tmp;
 	} else {
 		WARN("%s: unknown parameter", key);
 		return -EINVAL;
@@ -307,6 +312,7 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
 		MLX5_RXQ_CQE_COMP_EN,
 		MLX5_TXQ_INLINE,
 		MLX5_TXQS_MIN_INLINE,
+		MLX5_TXQ_MPW_EN,
 	};
 	struct rte_kvargs *kvlist;
 	int ret = 0;
@@ -502,6 +508,7 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
+		priv->mps = mps; /* Enable MPW by default if supported. */
 		priv->cqe_comp = 1; /* Enable compression by default. */
 		err = mlx5_args(priv, pci_dev->devargs);
 		if (err) {
@@ -550,7 +557,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 
 		priv_get_num_vfs(priv, &num_vfs);
 		priv->sriov = (num_vfs || sriov);
-		priv->mps = mps;
+		if (priv->mps && !mps) {
+			ERROR("multi-packet send not supported on this device"
+			      " (" MLX5_TXQ_MPW_EN ")");
+			err = ENOTSUP;
+			goto port_error;
+		}
 		/* Allocate and register default RSS hash keys. */
 		priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
 					    sizeof((*priv->rss_conf)[0]), 0);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index aeea4ff..698a50e 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -584,7 +584,8 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 		  DEV_RX_OFFLOAD_UDP_CKSUM |
 		  DEV_RX_OFFLOAD_TCP_CKSUM) :
 		 0);
-	info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
+	if (!priv->mps)
+		info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
 	if (priv->hw_csum)
 		info->tx_offload_capa |=
 			(DEV_TX_OFFLOAD_IPV4_CKSUM |
@@ -1318,7 +1319,17 @@ void
 priv_select_tx_function(struct priv *priv)
 {
 	priv->dev->tx_pkt_burst = mlx5_tx_burst;
-	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+	/* Display warning for unsupported configurations. */
+	if (priv->sriov && priv->mps)
+		WARN("multi-packet send WQE cannot be used on a SR-IOV setup");
+	/* Select appropriate TX function. */
+	if ((priv->sriov == 0) && priv->mps && priv->txq_inline) {
+		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
+		DEBUG("selected MPW inline TX function");
+	} else if ((priv->sriov == 0) && priv->mps) {
+		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw;
+		DEBUG("selected MPW TX function");
+	} else if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
 		priv->dev->tx_pkt_burst = mlx5_tx_burst_inline;
 		DEBUG("selected inline TX function (%u >= %u queues)",
 		      priv->txqs_n, priv->txqs_inline);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index daa22d9..ed2b5fe 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -802,6 +802,413 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 }
 
 /**
+ * Open a MPW session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ * @param length
+ *   Packet length.
+ */
+static inline void
+mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
+		(volatile struct mlx5_wqe_data_seg (*)[])
+		(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
+
+	mpw->state = MLX5_MPW_STATE_OPENED;
+	mpw->pkts_n = 0;
+	mpw->len = length;
+	mpw->total_len = 0;
+	mpw->wqe = &(*txq->wqes)[idx];
+	mpw->wqe->mpw.eseg.mss = htons(length);
+	mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
+	mpw->wqe->mpw.eseg.rsvd0 = 0;
+	mpw->wqe->mpw.eseg.rsvd1 = 0;
+	mpw->wqe->mpw.eseg.rsvd2 = 0;
+	mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+					   (txq->wqe_ci << 8) |
+					   MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->mpw.ctrl.data[2] = 0;
+	mpw->wqe->mpw.ctrl.data[3] = 0;
+	mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
+	mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
+	mpw->data.dseg[2] = &(*dseg)[0];
+	mpw->data.dseg[3] = &(*dseg)[1];
+	mpw->data.dseg[4] = &(*dseg)[2];
+}
+
+/**
+ * Close a MPW session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+	unsigned int num = mpw->pkts_n;
+
+	/*
+	 * Store size in multiple of 16 bytes. Control and Ethernet segments
+	 * count as 2.
+	 */
+	mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
+	mpw->state = MLX5_MPW_STATE_CLOSED;
+	if (num < 3)
+		++txq->wqe_ci;
+	else
+		txq->wqe_ci += 2;
+	tx_prefetch_wqe(txq, txq->wqe_ci);
+	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+}
+
+/**
+ * DPDK callback for TX with MPW support.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	uint16_t elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	unsigned int i;
+	unsigned int max;
+	unsigned int comp;
+	struct mlx5_mpw mpw = {
+		.state = MLX5_MPW_STATE_CLOSED,
+	};
+
+	/* Prefetch first packet cacheline. */
+	tx_prefetch_cqe(txq, txq->cq_ci);
+	tx_prefetch_wqe(txq, txq->wqe_ci);
+	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+	/* Start processing. */
+	txq_complete(txq);
+	max = (elts_n - (elts_head - txq->elts_tail));
+	if (max > elts_n)
+		max -= elts_n;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max == 0)
+		return 0;
+	if (max > pkts_n)
+		max = pkts_n;
+	for (i = 0; (i != max); ++i) {
+		struct rte_mbuf *buf = pkts[i];
+		volatile struct mlx5_wqe_data_seg *dseg;
+		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+		uintptr_t addr;
+		uint32_t length;
+		uint32_t cs_flags = 0;
+
+		/* Should we enable HW CKSUM offload */
+		if (buf->ol_flags &
+		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+		/* Retrieve buffer information. */
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		length = DATA_LEN(buf);
+		/* Update element. */
+		(*txq->elts)[elts_head] = buf;
+		/* Start new session if packet differs. */
+		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
+		    ((mpw.len != length) ||
+		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
+			mlx5_mpw_close(txq, &mpw);
+		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+			mlx5_mpw_new(txq, &mpw, length);
+			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+		}
+		dseg = mpw.data.dseg[mpw.pkts_n];
+		*dseg = (struct mlx5_wqe_data_seg){
+			.byte_count = htonl(length),
+			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+			.addr = htonll(addr),
+		};
+		++mpw.pkts_n;
+		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+			mlx5_mpw_close(txq, &mpw);
+		elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += length;
+#endif
+	}
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Check whether completion threshold has been reached. */
+	comp = txq->elts_comp + i;
+	if (comp >= MLX5_TX_COMP_THRESH) {
+		volatile union mlx5_wqe *wqe = mpw.wqe;
+
+		/* Request completion on last WQE. */
+		wqe->mpw.ctrl.data[2] = htonl(8);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		wqe->mpw.ctrl.data[3] = elts_head;
+		txq->elts_comp = 0;
+	} else {
+		txq->elts_comp = comp;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+#endif
+	/* Ring QP doorbell. */
+	if (mpw.state == MLX5_MPW_STATE_OPENED)
+		mlx5_mpw_close(txq, &mpw);
+	mlx5_tx_dbrec(txq);
+	txq->elts_head = elts_head;
+	return i;
+}
+
+/**
+ * Open a MPW inline session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ * @param length
+ *   Packet length.
+ */
+static inline void
+mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
+{
+	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
+
+	mpw->state = MLX5_MPW_INL_STATE_OPENED;
+	mpw->pkts_n = 0;
+	mpw->len = length;
+	mpw->total_len = 0;
+	mpw->wqe = &(*txq->wqes)[idx];
+	mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
+					       (txq->wqe_ci << 8) |
+					       MLX5_OPCODE_LSO_MPW);
+	mpw->wqe->mpw_inl.ctrl.data[2] = 0;
+	mpw->wqe->mpw_inl.ctrl.data[3] = 0;
+	mpw->wqe->mpw_inl.eseg.mss = htons(length);
+	mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
+	mpw->wqe->mpw_inl.eseg.cs_flags = 0;
+	mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
+	mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
+	mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
+	mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
+}
+
+/**
+ * Close a MPW inline session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ */
+static inline void
+mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+	unsigned int size;
+
+	size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
+	/*
+	 * Store size in multiple of 16 bytes. Control and Ethernet segments
+	 * count as 2.
+	 */
+	mpw->wqe->mpw_inl.ctrl.data[1] =
+		htonl(txq->qp_num_8s | ((size + 15) / 16));
+	mpw->state = MLX5_MPW_STATE_CLOSED;
+	mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
+	txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
+}
+
+/**
+ * DPDK callback for TX with MPW inline support.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
+			 uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	uint16_t elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	unsigned int i;
+	unsigned int max;
+	unsigned int comp;
+	unsigned int inline_room = txq->max_inline;
+	struct mlx5_mpw mpw = {
+		.state = MLX5_MPW_STATE_CLOSED,
+	};
+
+	/* Prefetch first packet cacheline. */
+	tx_prefetch_cqe(txq, txq->cq_ci);
+	tx_prefetch_wqe(txq, txq->wqe_ci);
+	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+	/* Start processing. */
+	txq_complete(txq);
+	max = (elts_n - (elts_head - txq->elts_tail));
+	if (max > elts_n)
+		max -= elts_n;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max == 0)
+		return 0;
+	if (max > pkts_n)
+		max = pkts_n;
+	for (i = 0; (i != max); ++i) {
+		struct rte_mbuf *buf = pkts[i];
+		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+		uintptr_t addr;
+		uint32_t length;
+		uint32_t cs_flags = 0;
+
+		/* Should we enable HW CKSUM offload */
+		if (buf->ol_flags &
+		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+		/* Retrieve buffer information. */
+		addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		length = DATA_LEN(buf);
+		/* Update element. */
+		(*txq->elts)[elts_head] = buf;
+		/* Start new session if packet differs. */
+		if (mpw.state == MLX5_MPW_STATE_OPENED) {
+			if ((mpw.len != length) ||
+			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
+				mlx5_mpw_close(txq, &mpw);
+		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
+			if ((mpw.len != length) ||
+			    (length > inline_room) ||
+			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
+				mlx5_mpw_inline_close(txq, &mpw);
+				inline_room = txq->max_inline;
+			}
+		}
+		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+			if (length > inline_room) {
+				mlx5_mpw_new(txq, &mpw, length);
+				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
+			} else {
+				mlx5_mpw_inline_new(txq, &mpw, length);
+				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
+			}
+		}
+		if (mpw.state == MLX5_MPW_STATE_OPENED) {
+			volatile struct mlx5_wqe_data_seg *dseg;
+
+			assert(inline_room == txq->max_inline);
+			dseg = mpw.data.dseg[mpw.pkts_n];
+			*dseg = (struct mlx5_wqe_data_seg){
+				.byte_count = htonl(length),
+				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+				.addr = htonll(addr),
+			};
+			++mpw.pkts_n;
+			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
+				mlx5_mpw_close(txq, &mpw);
+		} else {
+			unsigned int max;
+
+			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
+			assert(length <= inline_room);
+			/* Maximum number of bytes before wrapping. */
+			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
+			       (uintptr_t)mpw.data.raw);
+			if (length > max) {
+				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+					   (void *)addr,
+					   max);
+				mpw.data.raw =
+					(volatile void *)&(*txq->wqes)[0];
+				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+					   (void *)(addr + max),
+					   length - max);
+				mpw.data.raw += length - max;
+			} else {
+				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
+					   (void *)addr,
+					   length);
+				mpw.data.raw += length;
+			}
+			if ((uintptr_t)mpw.data.raw ==
+			    (uintptr_t)&(*txq->wqes)[txq->wqe_n])
+				mpw.data.raw =
+					(volatile void *)&(*txq->wqes)[0];
+			++mpw.pkts_n;
+			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
+				mlx5_mpw_inline_close(txq, &mpw);
+				inline_room = txq->max_inline;
+			} else {
+				inline_room -= length;
+			}
+		}
+		mpw.total_len += length;
+		elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += length;
+#endif
+	}
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Check whether completion threshold has been reached. */
+	comp = txq->elts_comp + i;
+	if (comp >= MLX5_TX_COMP_THRESH) {
+		volatile union mlx5_wqe *wqe = mpw.wqe;
+
+		/* Request completion on last WQE. */
+		wqe->mpw_inl.ctrl.data[2] = htonl(8);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		wqe->mpw_inl.ctrl.data[3] = elts_head;
+		txq->elts_comp = 0;
+	} else {
+		txq->elts_comp = comp;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+#endif
+	/* Ring QP doorbell. */
+	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
+		mlx5_mpw_inline_close(txq, &mpw);
+	else if (mpw.state == MLX5_MPW_STATE_OPENED)
+		mlx5_mpw_close(txq, &mpw);
+	mlx5_tx_dbrec(txq);
+	txq->elts_head = elts_head;
+	return i;
+}
+
+/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3c83148..41605f9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -312,6 +312,8 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst_inline(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 6a4a96e..4f17fb0 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -398,7 +398,7 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		.obj = tmpl.qp,
 		/* Enable multi-packet send if supported. */
 		.family_flags =
-			(priv->mps ?
+			((priv->mps && !priv->sriov) ?
 			 IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR :
 			 0),
 	};
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 20/25] mlx5: check remaining space while processing Tx burst
Date: Thu, 23 Jun 2016 19:05:18 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170518.f9VyA9OuLS2Ha8Rk2k3KHXl918AYC5PnRA9LY5GEOOE@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

The space necessary to store segmented packets cannot be known in advance
and must be verified for each of them.
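
The checks boil down to the following standalone sketch (simplified names,
not the driver's types): free ring entries are computed once with
wrap-around handling, then each packet verifies that enough room remains
while one entry is kept permanently unused.

#include <stdbool.h>
#include <stdio.h>

static unsigned int
ring_free_entries(unsigned int elts_head, unsigned int elts_tail,
		  unsigned int elts_n)
{
	unsigned int max = elts_n - (elts_head - elts_tail);

	if (max > elts_n) /* head has wrapped around behind tail */
		max -= elts_n;
	return max;
}

/* Room for the packet (one entry here, one per segment once Tx gather is
 * restored) plus one spare entry that always stays unused. */
static bool
ring_has_room(unsigned int max, unsigned int entries_needed)
{
	return max >= entries_needed + 1;
}

int
main(void)
{
	unsigned int max = ring_free_entries(2, 250, 256);

	printf("%u %d\n", max, ring_has_room(max, 1)); /* 248 1 */
	return 0;
}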

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 144 +++++++++++++++++++++++--------------------
 1 file changed, 78 insertions(+), 66 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ed2b5fe..fadc182 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -585,50 +585,51 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int i;
+	unsigned int i = 0;
 	unsigned int max;
 	unsigned int comp;
 	volatile union mlx5_wqe *wqe;
-	struct rte_mbuf *buf;
 
 	if (unlikely(!pkts_n))
 		return 0;
-	buf = pkts[0];
 	/* Prefetch first packet cacheline. */
 	tx_prefetch_cqe(txq, txq->cq_ci);
 	tx_prefetch_cqe(txq, txq->cq_ci + 1);
-	rte_prefetch0(buf);
+	rte_prefetch0(*pkts);
 	/* Start processing. */
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+	do {
+		struct rte_mbuf *buf;
+		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t lkey;
 
+		/*
+		 * Make sure there is enough room to store this packet and
+		 * that one ring entry remains unused.
+		 */
+		if (max < 1 + 1)
+			break;
+		--max;
+		--pkts_n;
+		buf = *(pkts++);
+		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
 		rte_prefetch0(wqe);
-		if (i + 1 < max)
-			rte_prefetch0(pkts[i + 1]);
+		if (pkts_n)
+			rte_prefetch0(*pkts);
 		/* Retrieve buffer information. */
 		addr = rte_pktmbuf_mtod(buf, uintptr_t);
 		length = DATA_LEN(buf);
 		/* Update element. */
 		(*txq->elts)[elts_head] = buf;
 		/* Prefetch next buffer data. */
-		if (i + 1 < max)
-			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+		if (pkts_n)
+			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
 						       volatile void *));
 		/* Retrieve Memory Region key for this memory pool. */
 		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -652,8 +653,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		txq->stats.obytes += length;
 #endif
 		elts_head = elts_head_next;
-		buf = pkts[i + 1];
-	}
+		++i;
+	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
@@ -697,44 +698,45 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int i;
+	unsigned int i = 0;
 	unsigned int max;
 	unsigned int comp;
 	volatile union mlx5_wqe *wqe;
-	struct rte_mbuf *buf;
 	unsigned int max_inline = txq->max_inline;
 
 	if (unlikely(!pkts_n))
 		return 0;
-	buf = pkts[0];
 	/* Prefetch first packet cacheline. */
 	tx_prefetch_cqe(txq, txq->cq_ci);
 	tx_prefetch_cqe(txq, txq->cq_ci + 1);
-	rte_prefetch0(buf);
+	rte_prefetch0(*pkts);
 	/* Start processing. */
 	txq_complete(txq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+	do {
+		struct rte_mbuf *buf;
+		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t lkey;
 
+		/*
+		 * Make sure there is enough room to store this packet and
+		 * that one ring entry remains unused.
+		 */
+		if (max < 1 + 1)
+			break;
+		--max;
+		--pkts_n;
+		buf = *(pkts++);
+		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
-		if (i + 1 < max)
-			rte_prefetch0(pkts[i + 1]);
+		if (pkts_n)
+			rte_prefetch0(*pkts);
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -750,8 +752,8 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Update element. */
 		(*txq->elts)[elts_head] = buf;
 		/* Prefetch next buffer data. */
-		if (i + 1 < max)
-			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1],
+		if (pkts_n)
+			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
 						       volatile void *));
 		if (length <= max_inline) {
 			if (buf->ol_flags & PKT_TX_VLAN_PKT)
@@ -771,12 +773,12 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		}
 		wqe->inl.ctrl.data[2] = 0;
 		elts_head = elts_head_next;
-		buf = pkts[i + 1];
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += length;
 #endif
-	}
+		++i;
+	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
@@ -887,13 +889,15 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int i;
+	unsigned int i = 0;
 	unsigned int max;
 	unsigned int comp;
 	struct mlx5_mpw mpw = {
 		.state = MLX5_MPW_STATE_CLOSED,
 	};
 
+	if (unlikely(!pkts_n))
+		return 0;
 	/* Prefetch first packet cacheline. */
 	tx_prefetch_cqe(txq, txq->cq_ci);
 	tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -903,22 +907,24 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
+	do {
+		struct rte_mbuf *buf;
 		volatile struct mlx5_wqe_data_seg *dseg;
-		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t cs_flags = 0;
 
+		/*
+		 * Make sure there is enough room to store this packet and
+		 * that one ring entry remains unused.
+		 */
+		if (max < 1 + 1)
+			break;
+		--max;
+		--pkts_n;
+		buf = *(pkts++);
+		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -951,7 +957,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += length;
 #endif
-	}
+		++i;
+	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
@@ -1059,7 +1066,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	struct txq *txq = (struct txq *)dpdk_txq;
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int i;
+	unsigned int i = 0;
 	unsigned int max;
 	unsigned int comp;
 	unsigned int inline_room = txq->max_inline;
@@ -1067,6 +1074,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		.state = MLX5_MPW_STATE_CLOSED,
 	};
 
+	if (unlikely(!pkts_n))
+		return 0;
 	/* Prefetch first packet cacheline. */
 	tx_prefetch_cqe(txq, txq->cq_ci);
 	tx_prefetch_wqe(txq, txq->wqe_ci);
@@ -1076,21 +1085,23 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
-		unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1);
+	do {
+		struct rte_mbuf *buf;
+		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t cs_flags = 0;
 
+		/*
+		 * Make sure there is enough room to store this packet and
+		 * that one ring entry remains unused.
+		 */
+		if (max < 1 + 1)
+			break;
+		--max;
+		--pkts_n;
+		buf = *(pkts++);
+		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -1177,7 +1188,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += length;
 #endif
-	}
+		++i;
+	} while (pkts_n);
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
-- 
2.1.4

From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 21/25] mlx5: resurrect Tx gather support
Date: Thu, 23 Jun 2016 19:05:19 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170519.z3UN-HjsmycATyh3QP4fHLNnBrtGUTCcCqy6toeiNVY@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

Compared to its previous incarnation, the software limit on the number of
mbuf segments is gone (previously MLX5_PMD_SGE_WR_N, set to 4 by default);
there is therefore no need for the linearization code and related buffers
that permanently consumed a non-negligible amount of memory to handle
oversized mbufs.

The resulting code is both lighter and faster.
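
The accounting behind the new gather loop can be summarized by the
standalone sketch below; WQE_SIZE, DSEG_SIZE and the seg structure are
simplified stand-ins for the driver's definitions, used for illustration
only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define WQE_SIZE  64u /* one work queue element */
#define DSEG_SIZE 16u /* one scatter/gather data segment */

struct seg { /* stand-in for an mbuf segment chain */
	uint32_t len;
	struct seg *next;
};

/*
 * Total number of 16-byte units ("ds" in the patch) used by a packet: the
 * first WQE (control + Ethernet + first data segment) counts as 4 units,
 * and every additional segment adds one data segment, spilling into the
 * next WQE whenever ds reaches a multiple of WQE_SIZE / DSEG_SIZE.
 */
static unsigned int
tx_gather_ds_count(const struct seg *first)
{
	unsigned int ds = WQE_SIZE / DSEG_SIZE;
	const struct seg *s;

	for (s = first->next; s != NULL; s = s->next)
		++ds;
	return ds;
}

int
main(void)
{
	struct seg s2 = { 512, NULL };
	struct seg s1 = { 128, &s2 };

	/* Two segments: 4 units for the first WQE + 1 extra dseg = 5. */
	printf("%u\n", tx_gather_ds_count(&s1));
	return 0;
}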

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 235 +++++++++++++++++++++++++++++++++----------
 drivers/net/mlx5/mlx5_txq.c  |   8 +-
 2 files changed, 188 insertions(+), 55 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index fadc182..c72e7ce 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -303,6 +303,7 @@ mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
 {
 	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
 	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+	wqe->wqe.ctrl.data[2] = 0;
 	wqe->wqe.ctrl.data[3] = 0;
 	wqe->inl.eseg.rsvd0 = 0;
 	wqe->inl.eseg.rsvd1 = 0;
@@ -348,6 +349,7 @@ mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
 
 	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
 	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
+	wqe->wqe.ctrl.data[2] = 0;
 	wqe->wqe.ctrl.data[3] = 0;
 	wqe->inl.eseg.rsvd0 = 0;
 	wqe->inl.eseg.rsvd1 = 0;
@@ -425,6 +427,7 @@ mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
 	assert(size < 64);
 	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
 	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+	wqe->inl.ctrl.data[2] = 0;
 	wqe->inl.ctrl.data[3] = 0;
 	wqe->inl.eseg.rsvd0 = 0;
 	wqe->inl.eseg.rsvd1 = 0;
@@ -498,6 +501,7 @@ mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
 	assert(size < 64);
 	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
 	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
+	wqe->inl.ctrl.data[2] = 0;
 	wqe->inl.ctrl.data[3] = 0;
 	wqe->inl.eseg.rsvd0 = 0;
 	wqe->inl.eseg.rsvd1 = 0;
@@ -586,6 +590,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
 	unsigned int i = 0;
+	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
 	volatile union mlx5_wqe *wqe;
@@ -602,23 +607,27 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf;
+		struct rte_mbuf *buf = *(pkts++);
 		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t lkey;
+		unsigned int segs_n = buf->nb_segs;
+		volatile struct mlx5_wqe_data_seg *dseg;
+		unsigned int ds = sizeof(*wqe) / 16;
 
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
 		 */
-		if (max < 1 + 1)
+		assert(segs_n);
+		if (max < segs_n + 1)
 			break;
-		--max;
+		max -= segs_n;
 		--pkts_n;
-		buf = *(pkts++);
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		dseg = &wqe->wqe.dseg;
 		rte_prefetch0(wqe);
 		if (pkts_n)
 			rte_prefetch0(*pkts);
@@ -638,7 +647,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 					    buf->vlan_tci);
 		else
 			mlx5_wqe_write(txq, wqe, addr, length, lkey);
-		wqe->wqe.ctrl.data[2] = 0;
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
@@ -648,6 +656,37 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		} else {
 			wqe->wqe.eseg.cs_flags = 0;
 		}
+		while (--segs_n) {
+			/*
+			 * Spill on next WQE when the current one does not have
+			 * enough room left. Size of WQE must be a multiple
+			 * of the data segment size.
+			 */
+			assert(!(sizeof(*wqe) % sizeof(*dseg)));
+			if (!(ds % (sizeof(*wqe) / 16)))
+				dseg = (volatile void *)
+					&(*txq->wqes)[txq->wqe_ci++ &
+						      (txq->wqe_n - 1)];
+			else
+				++dseg;
+			++ds;
+			buf = buf->next;
+			assert(buf);
+			/* Store segment information. */
+			dseg->byte_count = htonl(DATA_LEN(buf));
+			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+			(*txq->elts)[elts_head_next] = buf;
+			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			length += DATA_LEN(buf);
+#endif
+			++j;
+		}
+		/* Update DS field in WQE. */
+		wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
+		wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
+		elts_head = elts_head_next;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += length;
@@ -659,7 +698,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (unlikely(i == 0))
 		return 0;
 	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i;
+	comp = txq->elts_comp + i + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		/* Request completion on last WQE. */
 		wqe->wqe.ctrl.data[2] = htonl(8);
@@ -699,6 +738,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
 	unsigned int i = 0;
+	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
 	volatile union mlx5_wqe *wqe;
@@ -716,23 +756,27 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf;
+		struct rte_mbuf *buf = *(pkts++);
 		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
 		uint32_t lkey;
+		unsigned int segs_n = buf->nb_segs;
+		volatile struct mlx5_wqe_data_seg *dseg;
+		unsigned int ds = sizeof(*wqe) / 16;
 
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
 		 */
-		if (max < 1 + 1)
+		assert(segs_n);
+		if (max < segs_n + 1)
 			break;
-		--max;
+		max -= segs_n;
 		--pkts_n;
-		buf = *(pkts++);
 		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
+		dseg = &wqe->wqe.dseg;
 		tx_prefetch_wqe(txq, txq->wqe_ci);
 		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
 		if (pkts_n)
@@ -755,13 +799,14 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (pkts_n)
 			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
 						       volatile void *));
-		if (length <= max_inline) {
+		if ((length <= max_inline) && (segs_n == 1)) {
 			if (buf->ol_flags & PKT_TX_VLAN_PKT)
 				mlx5_wqe_write_inline_vlan(txq, wqe,
 							   addr, length,
 							   buf->vlan_tci);
 			else
 				mlx5_wqe_write_inline(txq, wqe, addr, length);
+			goto skip_segs;
 		} else {
 			/* Retrieve Memory Region key for this memory pool. */
 			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
@@ -771,7 +816,37 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			else
 				mlx5_wqe_write(txq, wqe, addr, length, lkey);
 		}
-		wqe->inl.ctrl.data[2] = 0;
+		while (--segs_n) {
+			/*
+			 * Spill on next WQE when the current one does not have
+			 * enough room left. Size of WQE must be a multiple
+			 * of the data segment size.
+			 */
+			assert(!(sizeof(*wqe) % sizeof(*dseg)));
+			if (!(ds % (sizeof(*wqe) / 16)))
+				dseg = (volatile void *)
+					&(*txq->wqes)[txq->wqe_ci++ &
+						      (txq->wqe_n - 1)];
+			else
+				++dseg;
+			++ds;
+			buf = buf->next;
+			assert(buf);
+			/* Store segment information. */
+			dseg->byte_count = htonl(DATA_LEN(buf));
+			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
+			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+			(*txq->elts)[elts_head_next] = buf;
+			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+			length += DATA_LEN(buf);
+#endif
+			++j;
+		}
+		/* Update DS field in WQE. */
+		wqe->inl.ctrl.data[1] &= htonl(0xffffffc0);
+		wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f);
+skip_segs:
 		elts_head = elts_head_next;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment sent bytes counter. */
@@ -783,7 +858,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (unlikely(i == 0))
 		return 0;
 	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i;
+	comp = txq->elts_comp + i + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		/* Request completion on last WQE. */
 		wqe->inl.ctrl.data[2] = htonl(8);
@@ -890,6 +965,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
 	unsigned int i = 0;
+	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
 	struct mlx5_mpw mpw = {
@@ -908,48 +984,69 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf;
-		volatile struct mlx5_wqe_data_seg *dseg;
+		struct rte_mbuf *buf = *(pkts++);
 		unsigned int elts_head_next;
-		uintptr_t addr;
 		uint32_t length;
+		unsigned int segs_n = buf->nb_segs;
 		uint32_t cs_flags = 0;
 
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
 		 */
-		if (max < 1 + 1)
+		assert(segs_n);
+		if (max < segs_n + 1)
 			break;
-		--max;
+		/* Do not bother with large packets that MPW cannot handle. */
+		if (segs_n > MLX5_MPW_DSEG_MAX)
+			break;
+		max -= segs_n;
 		--pkts_n;
-		buf = *(pkts++);
-		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
-		/* Retrieve buffer information. */
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		length = DATA_LEN(buf);
-		/* Update element. */
-		(*txq->elts)[elts_head] = buf;
+		/* Retrieve packet information. */
+		length = PKT_LEN(buf);
+		assert(length);
 		/* Start new session if packet differs. */
 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
 		    ((mpw.len != length) ||
+		     (segs_n != 1) ||
 		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
 			mlx5_mpw_new(txq, &mpw, length);
 			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
 		}
-		dseg = mpw.data.dseg[mpw.pkts_n];
-		*dseg = (struct mlx5_wqe_data_seg){
-			.byte_count = htonl(length),
-			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
-			.addr = htonll(addr),
-		};
-		++mpw.pkts_n;
+		/* Multi-segment packets must be alone in their MPW. */
+		assert((segs_n == 1) || (mpw.pkts_n == 0));
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+		length = 0;
+#endif
+		do {
+			volatile struct mlx5_wqe_data_seg *dseg;
+			uintptr_t addr;
+
+			elts_head_next = (elts_head + 1) & (elts_n - 1);
+			assert(buf);
+			(*txq->elts)[elts_head] = buf;
+			dseg = mpw.data.dseg[mpw.pkts_n];
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			*dseg = (struct mlx5_wqe_data_seg){
+				.byte_count = htonl(DATA_LEN(buf)),
+				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+				.addr = htonll(addr),
+			};
+			elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+			length += DATA_LEN(buf);
+#endif
+			buf = buf->next;
+			++mpw.pkts_n;
+			++j;
+		} while (--segs_n);
+		assert(length == mpw.len);
 		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
 			mlx5_mpw_close(txq, &mpw);
 		elts_head = elts_head_next;
@@ -963,7 +1060,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	if (unlikely(i == 0))
 		return 0;
 	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i;
+	/* "j" includes both packets and segments. */
+	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		volatile union mlx5_wqe *wqe = mpw.wqe;
 
@@ -1067,6 +1165,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	uint16_t elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
 	unsigned int i = 0;
+	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
 	unsigned int inline_room = txq->max_inline;
@@ -1086,38 +1185,40 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	if (max > elts_n)
 		max -= elts_n;
 	do {
-		struct rte_mbuf *buf;
+		struct rte_mbuf *buf = *(pkts++);
 		unsigned int elts_head_next;
 		uintptr_t addr;
 		uint32_t length;
+		unsigned int segs_n = buf->nb_segs;
 		uint32_t cs_flags = 0;
 
 		/*
 		 * Make sure there is enough room to store this packet and
 		 * that one ring entry remains unused.
 		 */
-		if (max < 1 + 1)
+		assert(segs_n);
+		if (max < segs_n + 1)
+			break;
+		/* Do not bother with large packets that MPW cannot handle. */
+		if (segs_n > MLX5_MPW_DSEG_MAX)
 			break;
-		--max;
+		max -= segs_n;
 		--pkts_n;
-		buf = *(pkts++);
-		elts_head_next = (elts_head + 1) & (elts_n - 1);
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
 			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
-		/* Retrieve buffer information. */
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
-		length = DATA_LEN(buf);
-		/* Update element. */
-		(*txq->elts)[elts_head] = buf;
+		/* Retrieve packet information. */
+		length = PKT_LEN(buf);
 		/* Start new session if packet differs. */
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
 			if ((mpw.len != length) ||
+			    (segs_n != 1) ||
 			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
 				mlx5_mpw_close(txq, &mpw);
 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
 			if ((mpw.len != length) ||
+			    (segs_n != 1) ||
 			    (length > inline_room) ||
 			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
 				mlx5_mpw_inline_close(txq, &mpw);
@@ -1125,7 +1226,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			}
 		}
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
-			if (length > inline_room) {
+			if ((segs_n != 1) ||
+			    (length > inline_room)) {
 				mlx5_mpw_new(txq, &mpw, length);
 				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
 			} else {
@@ -1133,17 +1235,36 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
 			}
 		}
+		/* Multi-segment packets must be alone in their MPW. */
+		assert((segs_n == 1) || (mpw.pkts_n == 0));
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
-			volatile struct mlx5_wqe_data_seg *dseg;
-
 			assert(inline_room == txq->max_inline);
-			dseg = mpw.data.dseg[mpw.pkts_n];
-			*dseg = (struct mlx5_wqe_data_seg){
-				.byte_count = htonl(length),
-				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
-				.addr = htonll(addr),
-			};
-			++mpw.pkts_n;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+			length = 0;
+#endif
+			do {
+				volatile struct mlx5_wqe_data_seg *dseg;
+
+				elts_head_next =
+					(elts_head + 1) & (elts_n - 1);
+				assert(buf);
+				(*txq->elts)[elts_head] = buf;
+				dseg = mpw.data.dseg[mpw.pkts_n];
+				addr = rte_pktmbuf_mtod(buf, uintptr_t);
+				*dseg = (struct mlx5_wqe_data_seg){
+					.byte_count = htonl(DATA_LEN(buf)),
+					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+					.addr = htonll(addr),
+				};
+				elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+				length += DATA_LEN(buf);
+#endif
+				buf = buf->next;
+				++mpw.pkts_n;
+				++j;
+			} while (--segs_n);
+			assert(length == mpw.len);
 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
 				mlx5_mpw_close(txq, &mpw);
 		} else {
@@ -1151,6 +1272,10 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 
 			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
 			assert(length <= inline_room);
+			assert(length == DATA_LEN(buf));
+			elts_head_next = (elts_head + 1) & (elts_n - 1);
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			(*txq->elts)[elts_head] = buf;
 			/* Maximum number of bytes before wrapping. */
 			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
 			       (uintptr_t)mpw.data.raw);
@@ -1175,6 +1300,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 				mpw.data.raw =
 					(volatile void *)&(*txq->wqes)[0];
 			++mpw.pkts_n;
+			++j;
 			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
 				mlx5_mpw_inline_close(txq, &mpw);
 				inline_room = txq->max_inline;
@@ -1194,7 +1320,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	if (unlikely(i == 0))
 		return 0;
 	/* Check whether completion threshold has been reached. */
-	comp = txq->elts_comp + i;
+	/* "j" includes both packets and segments. */
+	comp = txq->elts_comp + j;
 	if (comp >= MLX5_TX_COMP_THRESH) {
 		volatile union mlx5_wqe *wqe = mpw.wqe;
 
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index bae9f3d..6fe61c4 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -320,7 +320,13 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 			.max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
 					priv->device_attr.max_qp_wr :
 					desc),
-			/* Max number of scatter/gather elements in a WR. */
+			/*
+			 * Max number of scatter/gather elements in a WR,
+			 * must be 1 to prevent libmlx5 from trying to allocate
+			 * too much memory. TX gather is not impacted by the
+			 * priv->device_attr.max_sge limit and will still work
+			 * properly.
+			 */
 			.max_send_sge = 1,
 		},
 		.qp_type = IBV_QPT_RAW_PACKET,
-- 
2.1.4

WARNING: multiple messages have this Message-ID
From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 22/25] mlx5: work around spurious compilation errors
Date: Thu, 23 Jun 2016 19:05:20 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170520.fKsoI_I4pzY3yKUqOwfl0Aw3PxgkB_Ky_Yva4qFV87Q@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

Since commit "mlx5: resurrect Tx gather support", older GCC versions (such
as 4.8.5) may complain about the following:

 mlx5_rxtx.c: In function `mlx5_tx_burst':
 mlx5_rxtx.c:705:25: error: `wqe' may be used uninitialized in this
     function [-Werror=maybe-uninitialized]

 mlx5_rxtx.c: In function `mlx5_tx_burst_inline':
 mlx5_rxtx.c:864:25: error: `wqe' may be used uninitialized in this
     function [-Werror=maybe-uninitialized]

In both cases, this code cannot be reached when wqe is not initialized.

Considering older GCC versions are still widely used, work around this
issue by initializing wqe preemptively, even if it should not be necessary.
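
For reference, a minimal hypothetical reduction of the pattern (not taken
from the driver) that such GCC versions may flag, together with the same
preemptive initialization used as a workaround:

#include <stddef.h>

/* "last" is written only when n != 0 and read only when n != 0, a
 * correlation that older -Wmaybe-uninitialized implementations cannot
 * prove. */
int
last_value(const int *vals, size_t n)
{
	const int *last = NULL; /* preemptive init silences the warning */
	size_t i;

	for (i = 0; i != n; ++i)
		last = &vals[i];
	if (n == 0)
		return -1;
	return *last;
}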

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c72e7ce..8b67949 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -593,7 +593,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
-	volatile union mlx5_wqe *wqe;
+	volatile union mlx5_wqe *wqe = NULL;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -741,7 +741,7 @@ mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int j = 0;
 	unsigned int max;
 	unsigned int comp;
-	volatile union mlx5_wqe *wqe;
+	volatile union mlx5_wqe *wqe = NULL;
 	unsigned int max_inline = txq->max_inline;
 
 	if (unlikely(!pkts_n))
-- 
2.1.4

WARNING: multiple messages have this Message-ID
From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>
Subject: [dpdk-dev] [PATCH v5 23/25] mlx5: remove redundant Rx queue initialization code
Date: Thu, 23 Jun 2016 19:05:21 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170521.ZKhAHkKEh4yda3rBEf6dVHnrUg4FAfTWyLYtXD8ljUc@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

Toggling RX checksum offloads is already done at initialization time. This
code does not belong in rxq_rehash().

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_rxq.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b2f8f9a..fbf14fa 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -798,7 +798,6 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 int
 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 {
-	struct priv *priv = rxq_ctrl->priv;
 	struct rxq_ctrl tmpl = *rxq_ctrl;
 	unsigned int mbuf_n;
 	unsigned int desc_n;
@@ -811,16 +810,6 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 	/* Number of descriptors and mbufs currently allocated. */
 	desc_n = tmpl.rxq.elts_n;
 	mbuf_n = desc_n;
-	/* Toggle RX checksum offload if hardware supports it. */
-	if (priv->hw_csum) {
-		tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-		rxq_ctrl->rxq.csum = tmpl.rxq.csum;
-	}
-	if (priv->hw_csum_l2tun) {
-		tmpl.rxq.csum_l2tun =
-			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
-		rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
-	}
 	/* From now on, any failure will render the queue unusable.
 	 * Reinitialize WQ. */
 	mod = (struct ibv_exp_wq_attr){
-- 
2.1.4

WARNING: multiple messages have this Message-ID
From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>,
	Vasily Philipov <vasilyf@mellanox.com>
Subject: [dpdk-dev] [PATCH v5 24/25] mlx5: make Rx queue reinitialization safer
Date: Thu, 23 Jun 2016 19:05:22 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170522.aUY4uI8lBHnaD2fvyrjqw8Mq1wBmeYRd74CMgbpLwKM@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

From: Adrien Mazarguil <adrien.mazarguil@6wind.com>

The primary purpose of the rxq_rehash() function is to stop and restart
reception on a queue after re-posting buffers. This may fail if the array
that temporarily stores existing buffers for reuse cannot be allocated.

Update rxq_rehash() to work on the target queue directly (not through a
template copy) and avoid this allocation.

rxq_alloc_elts() is modified accordingly to take buffers from an existing
queue directly and update their refcount.

Unlike rxq_rehash(), rxq_setup() must work on a temporary structure but
should not allocate new mbufs from the pool while reinitializing an
existing queue. This is achieved by using the refcount-aware
rxq_alloc_elts() before overwriting queue data.
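
A rough sketch of the refcount trick described above (assumed rte_mbuf
refcount semantics, not a verbatim excerpt of the patch): each reused
buffer is referenced twice while the queue is reinitialized, so dropping
the old reference afterwards keeps the mbuf allocated instead of returning
it to the pool.

#include <rte_mbuf.h>

/* Hypothetical helper illustrating the reuse scheme. */
void
reuse_mbuf(struct rte_mbuf *buf)
{
	rte_pktmbuf_reset(buf);            /* rewind offsets and lengths */
	rte_pktmbuf_refcnt_update(buf, 1); /* old slot + new slot */
	/* ... buffer re-posted in the reinitialized queue here ... */
	rte_pktmbuf_free_seg(buf);         /* drops the old reference only */
}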

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxq.c | 83 ++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index fbf14fa..b2ddd0d 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -642,7 +642,7 @@ priv_rehash_flows(struct priv *priv)
  */
 static int
 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
-	       struct rte_mbuf **pool)
+	       struct rte_mbuf *(*pool)[])
 {
 	unsigned int i;
 	int ret = 0;
@@ -654,9 +654,10 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 			&(*rxq_ctrl->rxq.wqes)[i];
 
 		if (pool != NULL) {
-			buf = *(pool++);
+			buf = (*pool)[i];
 			assert(buf != NULL);
 			rte_pktmbuf_reset(buf);
+			rte_pktmbuf_refcnt_update(buf, 1);
 		} else
 			buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
 		if (buf == NULL) {
@@ -781,7 +782,7 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 }
 
 /**
- * Reconfigure a RX queue with new parameters.
+ * Reconfigure RX queue buffers.
  *
  * rxq_rehash() does not allocate mbufs, which, if not done from the right
  * thread (such as a control thread), may corrupt the pool.
@@ -798,67 +799,48 @@ rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
 int
 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 {
-	struct rxq_ctrl tmpl = *rxq_ctrl;
-	unsigned int mbuf_n;
-	unsigned int desc_n;
-	struct rte_mbuf **pool;
-	unsigned int i, k;
+	unsigned int elts_n = rxq_ctrl->rxq.elts_n;
+	unsigned int i;
 	struct ibv_exp_wq_attr mod;
 	int err;
 
 	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
-	/* Number of descriptors and mbufs currently allocated. */
-	desc_n = tmpl.rxq.elts_n;
-	mbuf_n = desc_n;
 	/* From now on, any failure will render the queue unusable.
 	 * Reinitialize WQ. */
 	mod = (struct ibv_exp_wq_attr){
 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
 		.wq_state = IBV_EXP_WQS_RESET,
 	};
-	err = ibv_exp_modify_wq(tmpl.wq, &mod);
+	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
 	if (err) {
 		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
 		assert(err > 0);
 		return err;
 	}
-	/* Allocate pool. */
-	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
-	if (pool == NULL) {
-		ERROR("%p: cannot allocate memory", (void *)dev);
-		return ENOBUFS;
-	}
 	/* Snatch mbufs from original queue. */
-	k = 0;
-	for (i = 0; (i != desc_n); ++i)
-		pool[k++] = (*rxq_ctrl->rxq.elts)[i];
-	assert(k == mbuf_n);
-	rte_free(pool);
+	claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
+	for (i = 0; i != elts_n; ++i) {
+		struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
+
+		assert(rte_mbuf_refcnt_read(buf) == 2);
+		rte_pktmbuf_free_seg(buf);
+	}
 	/* Change queue state to ready. */
 	mod = (struct ibv_exp_wq_attr){
 		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
 		.wq_state = IBV_EXP_WQS_RDY,
 	};
-	err = ibv_exp_modify_wq(tmpl.wq, &mod);
+	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
 	if (err) {
 		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
 		      (void *)dev, strerror(err));
 		goto error;
 	}
-	/* Post SGEs. */
-	err = rxq_alloc_elts(&tmpl, desc_n, pool);
-	if (err) {
-		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
-		rte_free(pool);
-		assert(err > 0);
-		return err;
-	}
 	/* Update doorbell counter. */
-	rxq_ctrl->rxq.rq_ci = desc_n;
+	rxq_ctrl->rxq.rq_ci = elts_n;
 	rte_wmb();
 	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
 error:
-	*rxq_ctrl = tmpl;
 	assert(err >= 0);
 	return err;
 }
@@ -868,24 +850,26 @@ error:
  *
  * @param tmpl
  *   Pointer to RX queue control template.
- * @param rxq_ctrl
- *   Pointer to RX queue control.
  *
  * @return
  *   0 on success, errno value on failure.
  */
 static inline int
-rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
+rxq_setup(struct rxq_ctrl *tmpl)
 {
 	struct ibv_cq *ibcq = tmpl->cq;
 	struct mlx5_cq *cq = to_mxxx(cq, cq);
 	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+	struct rte_mbuf *(*elts)[tmpl->rxq.elts_n] =
+		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
 
 	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
 		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
 		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
 		return EINVAL;
 	}
+	if (elts == NULL)
+		return ENOMEM;
 	tmpl->rxq.rq_db = rwq->rq.db;
 	tmpl->rxq.cqe_n = ibcq->cqe + 1;
 	tmpl->rxq.cq_ci = 0;
@@ -897,9 +881,7 @@ rxq_setup(struct rxq_ctrl *tmpl, struct rxq_ctrl *rxq_ctrl)
 	tmpl->rxq.cqes =
 		(volatile struct mlx5_cqe (*)[])
 		(uintptr_t)cq->active_buf->buf;
-	tmpl->rxq.elts =
-		(struct rte_mbuf *(*)[tmpl->rxq.elts_n])
-		((uintptr_t)rxq_ctrl + sizeof(*rxq_ctrl));
+	tmpl->rxq.elts = elts;
 	return 0;
 }
 
@@ -947,6 +929,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	enum ibv_exp_query_intf_status status;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
 	unsigned int cqe_n = desc - 1;
+	struct rte_mbuf *(*elts)[desc] = NULL;
 	int ret = 0;
 
 	(void)conf; /* Thresholds configuration (ignored). */
@@ -1104,13 +1087,19 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
-	ret = rxq_setup(&tmpl, rxq_ctrl);
+	ret = rxq_setup(&tmpl);
 	if (ret) {
 		ERROR("%p: cannot initialize RX queue structure: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
-	ret = rxq_alloc_elts(&tmpl, desc, NULL);
+	/* Reuse buffers from original queue if possible. */
+	if (rxq_ctrl->rxq.elts_n) {
+		assert(rxq_ctrl->rxq.elts_n == desc);
+		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
+		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
+	} else
+		ret = rxq_alloc_elts(&tmpl, desc, NULL);
 	if (ret) {
 		ERROR("%p: RXQ allocation failed: %s",
 		      (void *)dev, strerror(ret));
@@ -1119,6 +1108,14 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	/* Clean up rxq in case we're reinitializing it. */
 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
 	rxq_cleanup(rxq_ctrl);
+	/* Move mbuf pointers to dedicated storage area in RX queue. */
+	elts = (void *)(rxq_ctrl + 1);
+	rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
+#ifndef NDEBUG
+	memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
+#endif
+	rte_free(tmpl.rxq.elts);
+	tmpl.rxq.elts = elts;
 	*rxq_ctrl = tmpl;
 	/* Update doorbell counter. */
 	rxq_ctrl->rxq.rq_ci = desc;
@@ -1128,7 +1125,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	assert(ret == 0);
 	return 0;
 error:
+	elts = tmpl.rxq.elts;
 	rxq_cleanup(&tmpl);
+	rte_free(elts);
 	assert(ret > 0);
 	return ret;
 }
-- 
2.1.4

WARNING: multiple messages have this Message-ID
From: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
To: dev@dpdk.org
Cc: Ferruh Yigit <ferruh.yigit@intel.com>,
	Adrien Mazarguil <adrien.mazarguil@6wind.com>,
	Vasily Philipov <vasilyf@mellanox.com>
Subject: [dpdk-dev] [PATCH v5 25/25] mlx5: resurrect Rx scatter support
Date: Thu, 23 Jun 2016 19:05:23 +0200	[thread overview]
Message-ID: <1466700801-10383-11-git-send-email-nelio.laranjeiro@6wind.com> (raw)
Message-ID: <20160623170523.osARgTUveFNlgrFzflGqsh7qPKEGyH8GY4CLfSFicaM@z> (raw)
In-Reply-To: <1466700801-10383-1-git-send-email-nelio.laranjeiro@6wind.com>

This commit brings back Rx scatter support and the related handling in the
MTU update function. The maximum number of segments per packet is no longer
a fixed value (previously MLX5_PMD_SGE_WR_N, set to 4 by default): that
approach caused performance issues when fewer segments were actually needed
and limited the maximum packet size that could be received with the default
mbuf size (at most 8576 bytes).

These limitations are now lifted as the number of SGEs is derived from the
MTU (which implies MRU) at queue initialization and during MTU update.
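
As an illustration of that derivation, here is a standalone sketch with
assumed values (2048-byte mbuf data room, 128-byte headroom, 9000-byte
MTU) and a ceiling-log2 helper matching what log2above() is expected to
compute; none of it is copied from the patch:

#include <stdio.h>

/* Ceiling of log2(v) for v >= 1. */
unsigned int
ceil_log2(unsigned int v)
{
	unsigned int l = 0;
	unsigned int r = 0;

	for (; (v >> 1); ++l, v >>= 1)
		r |= (v & 1);
	return l + r;
}

int
main(void)
{
	const unsigned int mb_len = 2048;        /* mbuf data room size */
	const unsigned int headroom = 128;       /* RTE_PKTMBUF_HEADROOM */
	const unsigned int mtu = 9000;
	const unsigned int frame = 14 + mtu + 4; /* Ethernet header + CRC */
	const unsigned int size = headroom + frame;
	unsigned int sges_n = ceil_log2((size / mb_len) + !!(size % mb_len));

	/* 9146 bytes need 5 buffers, rounded up to 8 SGEs (2^3). */
	printf("sges_n=%u (%u SGEs per packet)\n", sges_n, 1u << sges_n);
	return 0;
}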

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Vasily Philipov <vasilyf@mellanox.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_ethdev.c |  90 ++++++++++++++++++++++----
 drivers/net/mlx5/mlx5_rxq.c    |  77 ++++++++++++++++++-----
 drivers/net/mlx5/mlx5_rxtx.c   | 139 ++++++++++++++++++++++++-----------------
 drivers/net/mlx5/mlx5_rxtx.h   |   1 +
 4 files changed, 225 insertions(+), 82 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 698a50e..72f0826 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -725,6 +725,9 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 	unsigned int i;
 	uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
 		mlx5_rx_burst;
+	unsigned int max_frame_len;
+	int rehash;
+	int restart = priv->started;
 
 	if (mlx5_is_secondary())
 		return -E_RTE_SECONDARY;
@@ -738,7 +741,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 		goto out;
 	} else
 		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
-	priv->mtu = mtu;
 	/* Temporarily replace RX handler with a fake one, assuming it has not
 	 * been copied elsewhere. */
 	dev->rx_pkt_burst = removed_rx_burst;
@@ -746,28 +748,94 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 	 * removed_rx_burst() instead. */
 	rte_wmb();
 	usleep(1000);
+	/* MTU does not include header and CRC. */
+	max_frame_len = ETHER_HDR_LEN + mtu + ETHER_CRC_LEN;
+	/* Check if at least one queue is going to need a SGE update. */
+	for (i = 0; i != priv->rxqs_n; ++i) {
+		struct rxq *rxq = (*priv->rxqs)[i];
+		unsigned int mb_len;
+		unsigned int size = RTE_PKTMBUF_HEADROOM + max_frame_len;
+		unsigned int sges_n;
+
+		if (rxq == NULL)
+			continue;
+		mb_len = rte_pktmbuf_data_room_size(rxq->mp);
+		assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = log2above((size / mb_len) + !!(size % mb_len));
+		if (sges_n != rxq->sges_n)
+			break;
+	}
+	/*
+	 * If all queues have the right number of SGEs, a simple rehash
+	 * of their buffers is enough, otherwise SGE information can only
+	 * be updated in a queue by recreating it. All resources that depend
+	 * on queues (flows, indirection tables) must be recreated as well in
+	 * that case.
+	 */
+	rehash = (i == priv->rxqs_n);
+	if (!rehash) {
+		/* Clean up everything as with mlx5_dev_stop(). */
+		priv_special_flow_disable_all(priv);
+		priv_mac_addrs_disable(priv);
+		priv_destroy_hash_rxqs(priv);
+		priv_fdir_disable(priv);
+		priv_dev_interrupt_handler_uninstall(priv, dev);
+	}
+recover:
 	/* Reconfigure each RX queue. */
 	for (i = 0; (i != priv->rxqs_n); ++i) {
 		struct rxq *rxq = (*priv->rxqs)[i];
-		unsigned int mb_len;
-		unsigned int max_frame_len;
+		struct rxq_ctrl *rxq_ctrl =
+			container_of(rxq, struct rxq_ctrl, rxq);
 		int sp;
+		unsigned int mb_len;
+		unsigned int tmp;
 
 		if (rxq == NULL)
 			continue;
-		/* Calculate new maximum frame length according to MTU and
-		 * toggle scattered support (sp) if necessary. */
-		max_frame_len = (priv->mtu + ETHER_HDR_LEN +
-				 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
 		mb_len = rte_pktmbuf_data_room_size(rxq->mp);
 		assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+		/* Toggle scattered support (sp) if necessary. */
 		sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM));
-		if (sp) {
-			ERROR("%p: RX scatter is not supported", (void *)dev);
-			ret = ENOTSUP;
-			goto out;
+		/* Provide new values to rxq_setup(). */
+		dev->data->dev_conf.rxmode.jumbo_frame = sp;
+		dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
+		if (rehash)
+			ret = rxq_rehash(dev, rxq_ctrl);
+		else
+			ret = rxq_ctrl_setup(dev, rxq_ctrl, rxq->elts_n,
+					     rxq_ctrl->socket, NULL, rxq->mp);
+		if (!ret)
+			continue;
+		/* Attempt to roll back in case of error. */
+		tmp = (mb_len << rxq->sges_n) - RTE_PKTMBUF_HEADROOM;
+		if (max_frame_len != tmp) {
+			max_frame_len = tmp;
+			goto recover;
 		}
+		/* Double fault, disable RX. */
+		break;
+	}
+	/*
+	 * Use a safe RX burst function in case of error, otherwise mimic
+	 * mlx5_dev_start().
+	 */
+	if (ret) {
+		ERROR("unable to reconfigure RX queues, RX disabled");
+		rx_func = removed_rx_burst;
+	} else if (restart &&
+		 !rehash &&
+		 !priv_create_hash_rxqs(priv) &&
+		 !priv_rehash_flows(priv)) {
+		if (dev->data->dev_conf.fdir_conf.mode == RTE_FDIR_MODE_NONE)
+			priv_fdir_enable(priv);
+		priv_dev_interrupt_handler_install(priv, dev);
 	}
+	priv->mtu = mtu;
 	/* Burst functions can now be called again. */
 	rte_wmb();
 	dev->rx_pkt_burst = rx_func;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b2ddd0d..908fd0f 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -644,10 +644,11 @@ static int
 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 	       struct rte_mbuf *(*pool)[])
 {
+	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
 	unsigned int i;
 	int ret = 0;
 
-	/* For each WR (packet). */
+	/* Iterate on segments. */
 	for (i = 0; (i != elts_n); ++i) {
 		struct rte_mbuf *buf;
 		volatile struct mlx5_wqe_data_seg *scat =
@@ -672,6 +673,9 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 		assert(rte_pktmbuf_data_len(buf) == 0);
 		assert(rte_pktmbuf_pkt_len(buf) == 0);
 		assert(!buf->next);
+		/* Only the first segment keeps headroom. */
+		if (i % sges_n)
+			SET_DATA_OFF(buf, 0);
 		PORT(buf) = rxq_ctrl->rxq.port_id;
 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
 		PKT_LEN(buf) = DATA_LEN(buf);
@@ -685,8 +689,8 @@ rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
 		};
 		(*rxq_ctrl->rxq.elts)[i] = buf;
 	}
-	DEBUG("%p: allocated and configured %u single-segment WRs",
-	      (void *)rxq_ctrl, elts_n);
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
 	assert(ret == 0);
 	return 0;
 error:
@@ -804,7 +808,9 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 	struct ibv_exp_wq_attr mod;
 	int err;
 
-	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
+	DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
+	      (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
+	assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
 	/* From now on, any failure will render the queue unusable.
 	 * Reinitialize WQ. */
 	mod = (struct ibv_exp_wq_attr){
@@ -837,7 +843,7 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
 		goto error;
 	}
 	/* Update doorbell counter. */
-	rxq_ctrl->rxq.rq_ci = elts_n;
+	rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
 	rte_wmb();
 	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
 error:
@@ -933,9 +939,42 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	int ret = 0;
 
 	(void)conf; /* Thresholds configuration (ignored). */
-	if (desc == 0) {
-		ERROR("%p: invalid number of RX descriptors (must be a"
-		      " multiple of 2)", (void *)dev);
+	/* Enable scattered packets support for this queue if necessary. */
+	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
+	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
+	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
+	     (mb_len - RTE_PKTMBUF_HEADROOM))) {
+		unsigned int size =
+			RTE_PKTMBUF_HEADROOM +
+			dev->data->dev_conf.rxmode.max_rx_pkt_len;
+		unsigned int sges_n;
+
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = log2above((size / mb_len) + !!(size % mb_len));
+		tmpl.rxq.sges_n = sges_n;
+		/* Make sure rxq.sges_n did not overflow. */
+		size = mb_len * (1 << tmpl.rxq.sges_n);
+		size -= RTE_PKTMBUF_HEADROOM;
+		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+			ERROR("%p: too many SGEs (%u) needed to handle"
+			      " requested maximum packet size %u",
+			      (void *)dev,
+			      1 << sges_n,
+			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
+			return EOVERFLOW;
+		}
+	}
+	DEBUG("%p: maximum number of segments per packet: %u",
+	      (void *)dev, 1 << tmpl.rxq.sges_n);
+	if (desc % (1 << tmpl.rxq.sges_n)) {
+		ERROR("%p: number of RX queue descriptors (%u) is not a"
+		      " multiple of SGEs per packet (%u)",
+		      (void *)dev,
+		      desc,
+		      1 << tmpl.rxq.sges_n);
 		return EINVAL;
 	}
 	/* Toggle RX checksum offload if hardware supports it. */
@@ -944,7 +983,6 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	if (priv->hw_csum_l2tun)
 		tmpl.rxq.csum_l2tun =
 			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
-	(void)mb_len; /* I'll be back! */
 	/* Use the entire RX mempool as the memory region. */
 	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
@@ -994,11 +1032,9 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		.wq_context = NULL, /* Could be useful in the future. */
 		.wq_type = IBV_EXP_WQT_RQ,
 		/* Max number of outstanding WRs. */
-		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)desc) ?
-				priv->device_attr.max_qp_wr :
-				(int)desc),
+		.max_recv_wr = desc >> tmpl.rxq.sges_n,
 		/* Max number of scatter/gather elements in a WR. */
-		.max_recv_sge = 1,
+		.max_recv_sge = 1 << tmpl.rxq.sges_n,
 		.pd = priv->pd,
 		.cq = tmpl.cq,
 		.comp_mask =
@@ -1050,6 +1086,19 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
+	/*
+	 * Make sure number of WRs*SGEs match expectations since a queue
+	 * cannot allocate more than "desc" buffers.
+	 */
+	if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
+	    ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
+		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+		      (void *)dev,
+		      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
+		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
+		ret = EINVAL;
+		goto error;
+	}
 	/* Save port ID. */
 	tmpl.rxq.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
@@ -1118,7 +1167,7 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
 	tmpl.rxq.elts = elts;
 	*rxq_ctrl = tmpl;
 	/* Update doorbell counter. */
-	rxq_ctrl->rxq.rq_ci = desc;
+	rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
 	rte_wmb();
 	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
 	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 8b67949..d944075 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1548,98 +1548,123 @@ uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct rxq *rxq = dpdk_rxq;
-	unsigned int pkts_ret = 0;
-	unsigned int i;
-	unsigned int rq_ci = rxq->rq_ci;
-	const unsigned int elts_n = rxq->elts_n;
-	const unsigned int wqe_cnt = elts_n - 1;
+	const unsigned int wqe_cnt = rxq->elts_n - 1;
 	const unsigned int cqe_cnt = rxq->cqe_n - 1;
+	const unsigned int sges_n = rxq->sges_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	volatile struct mlx5_cqe64 *cqe =
+		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+	unsigned int i = 0;
+	unsigned int rq_ci = rxq->rq_ci << sges_n;
+	int len;
 
-	for (i = 0; (i != pkts_n); ++i) {
+	while (pkts_n) {
 		unsigned int idx = rq_ci & wqe_cnt;
-		int len;
-		struct rte_mbuf *rep;
-		struct rte_mbuf *pkt;
 		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
-		volatile struct mlx5_cqe64 *cqe =
-			&(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
 
-		pkt = (*rxq->elts)[idx];
+		if (pkt)
+			NEXT(seg) = rep;
+		seg = rep;
+		rte_prefetch0(seg);
 		rte_prefetch0(cqe);
+		rte_prefetch0(wqe);
 		rep = rte_mbuf_raw_alloc(rxq->mp);
 		if (unlikely(rep == NULL)) {
+			while (pkt) {
+				seg = NEXT(pkt);
+				rte_mbuf_refcnt_set(pkt, 0);
+				__rte_mbuf_raw_free(pkt);
+				pkt = seg;
+			}
 			++rxq->stats.rx_nombuf;
 			break;
 		}
-		SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
-		NB_SEGS(rep) = 1;
-		PORT(rep) = rxq->port_id;
-		NEXT(rep) = NULL;
-		len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
-		if (unlikely(len == 0)) {
-			rte_mbuf_refcnt_set(rep, 0);
-			__rte_mbuf_raw_free(rep);
-			break;
-		}
-		if (unlikely(len == -1)) {
-			/* RX error, packet is likely too large. */
-			rte_mbuf_refcnt_set(rep, 0);
-			__rte_mbuf_raw_free(rep);
-			++rxq->stats.idropped;
-			--i;
-			goto skip;
+		if (!pkt) {
+			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
+			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
+			if (len == 0) {
+				rte_mbuf_refcnt_set(rep, 0);
+				__rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len == -1)) {
+				/* RX error, packet is likely too large. */
+				rte_mbuf_refcnt_set(rep, 0);
+				__rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			assert(len >= (rxq->crc_present << 2));
+			/* Update packet information. */
+			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+			    rxq->crc_present) {
+				if (rxq->csum) {
+					pkt->packet_type =
+						rxq_cq_to_pkt_type(cqe);
+					pkt->ol_flags =
+						rxq_cq_to_ol_flags(rxq, cqe);
+				}
+				if (cqe->l4_hdr_type_etc &
+				    MLX5_CQE_VLAN_STRIPPED) {
+					pkt->ol_flags |= PKT_RX_VLAN_PKT;
+					pkt->vlan_tci = ntohs(cqe->vlan_info);
+				}
+				if (rxq->crc_present)
+					len -= ETHER_CRC_LEN;
+			}
+			PKT_LEN(pkt) = len;
 		}
+		DATA_LEN(rep) = DATA_LEN(seg);
+		PKT_LEN(rep) = PKT_LEN(seg);
+		SET_DATA_OFF(rep, DATA_OFF(seg));
+		NB_SEGS(rep) = NB_SEGS(seg);
+		PORT(rep) = PORT(seg);
+		NEXT(rep) = NULL;
+		(*rxq->elts)[idx] = rep;
 		/*
 		 * Fill NIC descriptor with the new buffer.  The lkey and size
 		 * of the buffers are already known, only the buffer address
 		 * changes.
 		 */
-		wqe->addr = htonll((uintptr_t)rep->buf_addr +
-				   RTE_PKTMBUF_HEADROOM);
-		(*rxq->elts)[idx] = rep;
-		/* Update pkt information. */
-		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
-		    rxq->crc_present) {
-			if (rxq->csum) {
-				pkt->packet_type = rxq_cq_to_pkt_type(cqe);
-				pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
-			}
-			if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
-				pkt->ol_flags |= PKT_RX_VLAN_PKT;
-				pkt->vlan_tci = ntohs(cqe->vlan_info);
-			}
-			if (rxq->crc_present)
-				len -= ETHER_CRC_LEN;
+		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+		if (len > DATA_LEN(seg)) {
+			len -= DATA_LEN(seg);
+			++NB_SEGS(pkt);
+			++rq_ci;
+			continue;
 		}
-		PKT_LEN(pkt) = len;
-		DATA_LEN(pkt) = len;
+		DATA_LEN(seg) = len;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment bytes counter. */
-		rxq->stats.ibytes += len;
+		rxq->stats.ibytes += PKT_LEN(pkt);
 #endif
 		/* Return packet. */
 		*(pkts++) = pkt;
-		++pkts_ret;
+		pkt = NULL;
+		--pkts_n;
+		++i;
 skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sges_n;
 		++rq_ci;
+		rq_ci <<= sges_n;
 	}
-	if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
+	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
 		return 0;
-	/* Repost WRs. */
-#ifdef DEBUG_RECV
-	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
-#endif
 	/* Update the consumer index. */
-	rxq->rq_ci = rq_ci;
+	rxq->rq_ci = rq_ci >> sges_n;
 	rte_wmb();
 	*rxq->cq_db = htonl(rxq->cq_ci);
 	rte_wmb();
 	*rxq->rq_db = htonl(rxq->rq_ci);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Increment packets counter. */
-	rxq->stats.ipackets += pkts_ret;
+	rxq->stats.ipackets += i;
 #endif
-	return pkts_ret;
+	return i;
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 41605f9..f6e2cba 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -106,6 +106,7 @@ struct rxq {
 	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
 	unsigned int vlan_strip:1; /* Enable VLAN stripping. */
 	unsigned int crc_present:1; /* CRC must be subtracted. */
+	unsigned int sges_n:2; /* Log 2 of SGEs (max buffers per packet). */
 	uint16_t rq_ci;
 	uint16_t cq_ci;
 	uint16_t elts_n;
-- 
2.1.4

Thread overview: 211+ messages
2016-06-08  9:47 [dpdk-dev] [PATCH 00/24] Refactor mlx5 to improve performance Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 01/24] mlx5: split memory registration function for better performance Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 02/24] mlx5: remove TX gather support Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 03/24] mlx5: remove RX scatter support Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 04/24] mlx5: remove configuration variable for maximum number of segments Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 05/24] mlx5: remove inline TX support Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 06/24] mlx5: split TX queue structure Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 07/24] mlx5: split RX " Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 08/24] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 09/24] mlx5: add definitions for data path without Verbs Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 10/24] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 11/24] mlx5: add TX/RX burst function selection wrapper Nelio Laranjeiro
2016-06-08  9:47 ` [dpdk-dev] [PATCH 12/24] mlx5: refactor RX data path Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 13/24] mlx5: refactor TX " Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 14/24] mlx5: handle RX CQE compression Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 15/24] mlx5: replace countdown with threshold for TX completions Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 16/24] mlx5: add support for inline send Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 17/24] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 18/24] mlx5: add debugging information about TX queues capabilities Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 19/24] mlx5: check remaining space while processing TX burst Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 20/24] mlx5: resurrect TX gather support Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 21/24] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 22/24] mlx5: remove redundant RX queue initialization code Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 23/24] mlx5: make RX queue reinitialization safer Nelio Laranjeiro
2016-06-08  9:48 ` [dpdk-dev] [PATCH 24/24] mlx5: resurrect RX scatter support Nelio Laranjeiro
2016-06-13 18:50 ` [dpdk-dev] [PATCH 00/24] Refactor mlx5 to improve performance Javier Blazquez
2016-06-14  6:57   ` Nélio Laranjeiro
2016-06-17 16:09 ` Ferruh Yigit
2016-06-20  7:38   ` Nélio Laranjeiro
2016-06-20 15:03     ` Ferruh Yigit
2016-06-20 15:11       ` Nélio Laranjeiro
2016-06-20 16:10 ` [dpdk-dev] [PATCH v2 00/25] " Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 01/25] drivers: fix PCI class id support Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 02/25] mlx5: split memory registration function Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 03/25] mlx5: remove Tx gather support Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 04/25] mlx5: remove Rx scatter support Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 05/25] mlx5: remove configuration variable Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 06/25] mlx5: remove inline Tx support Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 07/25] mlx5: split Tx queue structure Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 08/25] mlx5: split Rx " Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 09/25] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 10/25] mlx5: add definitions for data path without Verbs Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 11/25] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 12/25] mlx5: add Tx/Rx burst function selection wrapper Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 13/25] mlx5: refactor Rx data path Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 14/25] mlx5: refactor Tx " Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 15/25] mlx5: handle Rx CQE compression Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 16/25] mlx5: replace countdown with threshold for Tx completions Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 17/25] mlx5: add support for inline send Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 18/25] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 19/25] mlx5: add debugging information about Tx queues capabilities Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 20/25] mlx5: check remaining space while processing Tx burst Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 21/25] mlx5: resurrect Tx gather support Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 22/25] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 23/25] mlx5: remove redundant Rx queue initialization code Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 24/25] mlx5: make Rx queue reinitialization safer Nelio Laranjeiro
2016-06-20 16:10   ` [dpdk-dev] [PATCH v2 25/25] mlx5: resurrect Rx scatter support Nelio Laranjeiro
2016-06-20 17:01   ` [dpdk-dev] [PATCH v2 00/25] Refactor mlx5 to improve performance Ferruh Yigit
2016-06-21  6:44     ` Nélio Laranjeiro
2016-06-21  7:23 ` [dpdk-dev] [PATCH v3 " Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 01/25] drivers: fix PCI class id support Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 02/25] mlx5: split memory registration function Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 03/25] mlx5: remove Tx gather support Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 04/25] mlx5: remove Rx scatter support Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 05/25] mlx5: remove configuration variable Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 06/25] mlx5: remove inline Tx support Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 07/25] mlx5: split Tx queue structure Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 08/25] mlx5: split Rx " Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 09/25] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 10/25] mlx5: add definitions for data path without Verbs Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 11/25] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-21 16:42     ` Ferruh Yigit
2016-06-22  7:30       ` Nélio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 12/25] mlx5: add Tx/Rx burst function selection wrapper Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 13/25] mlx5: refactor Rx data path Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 14/25] mlx5: refactor Tx " Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 15/25] mlx5: handle Rx CQE compression Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 16/25] mlx5: replace countdown with threshold for Tx completions Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 17/25] mlx5: add support for inline send Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 18/25] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 19/25] mlx5: add debugging information about Tx queues capabilities Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 20/25] mlx5: check remaining space while processing Tx burst Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 21/25] mlx5: resurrect Tx gather support Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 22/25] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 23/25] mlx5: remove redundant Rx queue initialization code Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 24/25] mlx5: make Rx queue reinitialization safer Nelio Laranjeiro
2016-06-21  7:23   ` [dpdk-dev] [PATCH v3 25/25] mlx5: resurrect Rx scatter support Nelio Laranjeiro
2016-06-21  7:43   ` [dpdk-dev] [PATCH v3 00/25] Refactor mlx5 to improve performance Yuanhan Liu
2016-06-21  8:00     ` Nélio Laranjeiro
2016-06-21  8:05       ` Yuanhan Liu
2016-06-21  8:49         ` Nélio Laranjeiro
2016-06-21 10:44     ` Ferruh Yigit
2016-06-21 12:26       ` Thomas Monjalon
2016-06-21 16:42   ` Ferruh Yigit
2016-06-22  8:20     ` Adrien Mazarguil
2016-06-22  9:19       ` Bruce Richardson
2016-06-22  9:30         ` Adrien Mazarguil
2016-06-23 15:14           ` Adrien Mazarguil
2016-06-22  9:05   ` [dpdk-dev] [PATCH " Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 01/25] drivers: fix PCI class id support Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 02/25] mlx5: split memory registration function Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 03/25] mlx5: remove Tx gather support Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 04/25] mlx5: remove Rx scatter support Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 05/25] mlx5: remove configuration variable Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 06/25] mlx5: remove inline Tx support Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 07/25] mlx5: split Tx queue structure Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 08/25] mlx5: split Rx " Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 09/25] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 10/25] mlx5: add definitions for data path without Verbs Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 11/25] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 12/25] mlx5: add Tx/Rx burst function selection wrapper Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 13/25] mlx5: refactor Rx data path Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 14/25] mlx5: refactor Tx " Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 15/25] mlx5: handle Rx CQE compression Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 16/25] mlx5: replace countdown with threshold for Tx completions Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 17/25] mlx5: add support for inline send Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 18/25] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 19/25] mlx5: add debugging information about Tx queues capabilities Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 20/25] mlx5: check remaining space while processing Tx burst Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 21/25] mlx5: resurrect Tx gather support Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 22/25] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 23/25] mlx5: remove redundant Rx queue initialization code Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 24/25] mlx5: make Rx queue reinitialization safer Nelio Laranjeiro
2016-06-22  9:05     ` [dpdk-dev] [PATCH v4 25/25] mlx5: resurrect Rx scatter support Nelio Laranjeiro
2016-06-23 16:52     ` [dpdk-dev] [PATCH v5 00/25] Refactor mlx5 to improve performance Nelio Laranjeiro
2016-06-23 16:52       ` [dpdk-dev] [PATCH v5 01/25] drivers: fix PCI class id support Nelio Laranjeiro
2016-06-23 16:52       ` [dpdk-dev] [PATCH v5 02/25] mlx5: split memory registration function Nelio Laranjeiro
2016-06-23 16:52       ` [dpdk-dev] [PATCH v5 03/25] mlx5: remove Tx gather support Nelio Laranjeiro
2016-06-23 16:53       ` [dpdk-dev] [PATCH v5 04/25] mlx5: remove Rx scatter support Nelio Laranjeiro
2016-06-23 16:53       ` [dpdk-dev] [PATCH v5 05/25] mlx5: remove configuration variable Nelio Laranjeiro
2016-06-23 16:53       ` [dpdk-dev] [PATCH v5 06/25] mlx5: remove inline Tx support Nelio Laranjeiro
2016-06-23 16:53       ` [dpdk-dev] [PATCH v5 07/25] mlx5: split Tx queue structure Nelio Laranjeiro
2016-06-23 16:53       ` [dpdk-dev] [PATCH v5 08/25] mlx5: split Rx " Nelio Laranjeiro
2016-06-23 16:53       ` [dpdk-dev] [PATCH v5 09/25] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-23 16:53       ` Nelio Laranjeiro [this message]
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 11/25] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 12/25] mlx5: add Tx/Rx burst function selection wrapper Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 13/25] mlx5: refactor Rx data path Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 14/25] mlx5: refactor Tx " Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 15/25] mlx5: handle Rx CQE compression Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 16/25] mlx5: replace countdown with threshold for Tx completions Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 17/25] mlx5: add support for inline send Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 18/25] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 19/25] mlx5: add debugging information about Tx queues capabilities Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 20/25] mlx5: check remaining space while processing Tx burst Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 21/25] mlx5: resurrect Tx gather support Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 22/25] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 23/25] mlx5: remove redundant Rx queue initialization code Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 24/25] mlx5: make Rx queue reinitialization safer Nelio Laranjeiro
2016-06-23 17:05         ` [dpdk-dev] [PATCH v5 25/25] mlx5: resurrect Rx scatter support Nelio Laranjeiro
2016-06-23 17:11       ` [dpdk-dev] [PATCH v5 00/25] Refactor mlx5 to improve performance Nélio Laranjeiro
2016-06-24  8:50       ` [dpdk-dev] [PATCH v6 " Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 01/25] drivers: fix PCI class id support Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 02/25] mlx5: split memory registration function Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 03/25] mlx5: remove Tx gather support Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 04/25] mlx5: remove Rx scatter support Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 05/25] mlx5: remove configuration variable Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 06/25] mlx5: remove inline Tx support Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 07/25] mlx5: split Tx queue structure Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 08/25] mlx5: split Rx " Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 09/25] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 10/25] mlx5: add definitions for data path without Verbs Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 11/25] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 12/25] mlx5: add Tx/Rx burst function selection wrapper Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 13/25] mlx5: refactor Rx data path Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 14/25] mlx5: refactor Tx " Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 15/25] mlx5: handle Rx CQE compression Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 16/25] mlx5: replace countdown with threshold for Tx completions Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 17/25] mlx5: add support for inline send Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 18/25] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 19/25] mlx5: add debugging information about Tx queues capabilities Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 20/25] mlx5: check remaining space while processing Tx burst Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 21/25] mlx5: resurrect Tx gather support Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 22/25] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-24  8:50         ` [dpdk-dev] [PATCH v6 23/25] mlx5: remove redundant Rx queue initialization code Nelio Laranjeiro
2016-06-24  8:51         ` [dpdk-dev] [PATCH v6 24/25] mlx5: make Rx queue reinitialization safer Nelio Laranjeiro
2016-06-24  8:51         ` [dpdk-dev] [PATCH v6 25/25] mlx5: resurrect Rx scatter support Nelio Laranjeiro
2016-06-24 13:04         ` [dpdk-dev] [PATCH v6 00/25] Refactor mlx5 to improve performance Nélio Laranjeiro
2016-06-24 13:17         ` [dpdk-dev] [PATCH v7 " Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 01/25] drivers: fix PCI class id support Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 02/25] mlx5: split memory registration function Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 03/25] mlx5: remove Tx gather support Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 04/25] mlx5: remove Rx scatter support Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 05/25] mlx5: remove configuration variable Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 06/25] mlx5: remove inline Tx support Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 07/25] mlx5: split Tx queue structure Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 08/25] mlx5: split Rx " Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 09/25] mlx5: update prerequisites for upcoming enhancements Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 10/25] mlx5: add definitions for data path without Verbs Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 11/25] mlx5: add support for configuration through kvargs Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 12/25] mlx5: add Tx/Rx burst function selection wrapper Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 13/25] mlx5: refactor Rx data path Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 14/25] mlx5: refactor Tx " Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 15/25] mlx5: handle Rx CQE compression Nelio Laranjeiro
2016-06-27 12:03             ` Bruce Richardson
2016-06-27 12:22               ` Nélio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 16/25] mlx5: replace countdown with threshold for Tx completions Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 17/25] mlx5: add support for inline send Nelio Laranjeiro
2016-06-27 12:17             ` Bruce Richardson
2016-06-27 12:24               ` Nélio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 18/25] mlx5: add support for multi-packet send Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 19/25] mlx5: add debugging information about Tx queues capabilities Nelio Laranjeiro
2016-06-24 13:17           ` [dpdk-dev] [PATCH v7 20/25] mlx5: check remaining space while processing Tx burst Nelio Laranjeiro
2016-06-24 13:18           ` [dpdk-dev] [PATCH v7 21/25] mlx5: resurrect Tx gather support Nelio Laranjeiro
2016-06-24 13:18           ` [dpdk-dev] [PATCH v7 22/25] mlx5: work around spurious compilation errors Nelio Laranjeiro
2016-06-27 12:12             ` Bruce Richardson
2016-06-27 12:27               ` Adrien Mazarguil
2016-06-24 13:18           ` [dpdk-dev] [PATCH v7 23/25] mlx5: remove redundant Rx queue initialization code Nelio Laranjeiro
2016-06-24 13:18           ` [dpdk-dev] [PATCH v7 24/25] mlx5: make Rx queue reinitialization safer Nelio Laranjeiro
2016-06-24 13:18           ` [dpdk-dev] [PATCH v7 25/25] mlx5: resurrect Rx scatter support Nelio Laranjeiro
2016-06-27 12:31           ` [dpdk-dev] [PATCH v7 00/25] Refactor mlx5 to improve performance Bruce Richardson
